From f954a5c7053becec42a9fdc5cd4add64c16e98d6 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Tue, 3 Feb 2026 12:32:30 +0100 Subject: [PATCH 01/14] mlir backend: parameterized xdsl type --- src/xtc/backends/mlir/MlirGraphBackend.py | 17 ++++++++++------- src/xtc/backends/mlir/MlirNodeBackend.py | 14 ++++++++------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 191cad027..c39fc5b2e 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -7,7 +7,7 @@ from xdsl.dialects.func import FuncOp as xdslFuncOp from xdsl.dialects import func, memref -from xdsl.dialects.builtin import MemRefType, f32, f64 +from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64 from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -28,7 +28,9 @@ def __init__( concluding_passes: list[str] = [], always_vectorize: bool = False, no_alias: bool = True, + use_tensor_dialect: bool = False, ): + self.xdsl_type = TensorType if use_tensor_dialect else MemRefType if isinstance(xdsl_func, XTCGraph): assert nodes is None graph = xdsl_func @@ -128,6 +130,7 @@ def _init_from_graph( always_vectorize=always_vectorize, concluding_passes=concluding_passes, id=f"__xtc_id_{node_id}_", + xdsl_type=self.xdsl_type ) return payload, nodes_dict @@ -137,10 +140,10 @@ def _xdsl_elt_shape_from_tensortype(self, type: XTCTensorType) -> tuple[Any, Any def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any: elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) - return MemRefType(elt_type, shape) + return self.xdsl_type(elt_type, shape) def _np_types_spec( - self, types: list[MemRefType] + self, types: list[MemRefType | TensorType] ) -> list[dict[str, tuple[int, ...] | str]]: types_map = {"f32": "float32", "f64": "float64"} types_spec: list[dict[str, tuple[int, ...] | str]] = [ @@ -156,12 +159,12 @@ def _np_types_spec( def np_inputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]] - list_memref_tys = cast(list[MemRefType], inputs_args_types) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types) + return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]] - list_memref_tys = cast(list[MemRefType], outputs_args_types) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], outputs_args_types) + return self._np_types_spec(list_xdsl_tys) diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py index 135e99b80..8ae0c97e7 100644 --- a/src/xtc/backends/mlir/MlirNodeBackend.py +++ b/src/xtc/backends/mlir/MlirNodeBackend.py @@ -6,7 +6,7 @@ from typing_extensions import override from xdsl.ir import Operation as xdslOperation -from xdsl.dialects.builtin import MemRefType as xdslAnyMemRefType +from xdsl.dialects.builtin import MemRefType, TensorType from xdsl.dialects.builtin import UnitAttr as xdslUnitAttr from xtc.utils.xdsl_aux import xdsl_operator_to_function @@ -26,8 +26,10 @@ def __init__( always_vectorize: bool = False, no_alias: bool = True, id: str | None = None, + xdsl_type: MemRefType | TensorType = MemRefType, ): self._graph = None + self.xdsl_type = xdsl_type if id is None: self.op_id_attribute = f"__id{MlirNodeBackend.count}__" MlirNodeBackend.count += 1 @@ -48,7 +50,7 @@ def __init__( self.loop_stamps = loop_stamps def _np_types_spec( - self, types: list[xdslAnyMemRefType] + self, types: list[MemRefType | TensorType] ) -> list[dict[str, tuple[int, ...] | str]]: types_map = {"f32": "float32", "f64": "float64"} types_spec: list[dict[str, tuple[int, ...] | str]] = [ @@ -63,11 +65,11 @@ def _np_types_spec( @override def np_inputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.inputs] # type: ignore - list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) + return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.outputs] # type: ignore - list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) + return self._np_types_spec(list_xdsl_tys) From 551b946dc239930e289efa76d01a329a1d1fea7b Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Wed, 4 Feb 2026 10:42:57 +0100 Subject: [PATCH 02/14] tensor: added matmul, tensor graph parsing --- src/xtc/backends/mlir/MlirGraphBackend.py | 10 +++- src/xtc/backends/mlir/MlirOps.py | 56 +++++++++++++------ src/xtc/utils/xdsl_aux.py | 8 ++- .../backends/test_matmul_mlir_tensor.py | 31 ++++++++++ 4 files changed, 83 insertions(+), 22 deletions(-) create mode 100644 tests/filecheck/backends/test_matmul_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index c39fc5b2e..ab2cc09c5 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -64,7 +64,7 @@ def _init_from_xdsl( def _xdsl_generate_node( self, node: XTCNode, block: Block, variables: dict[str, Any] ): - operation = MlirOperation.from_operation(node.operation, name=node.name) + operation = MlirOperation.from_operation(node.operation, name=node.name, op_type=self.xdsl_type) names = [*node.inputs, *node.outputs] assert node.inputs_types is not None and node.outputs_types is not None types = [*node.inputs_types, *node.outputs_types] @@ -108,13 +108,17 @@ def _init_from_graph( for node in graph.nodes.values(): node_attrs = self._xdsl_generate_node(node, inlined_block, variables) block_attrs.append(node_attrs) + return_val = block_attrs[-1]["nodes_map"]["return_node_id"] with ImplicitBuilder(inlined_block): - func.ReturnOp() + if return_val: + func.ReturnOp(return_val) + else: + func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy payload = xdslFuncOp.from_region( name=graph.name, input_types=params_types, - return_types=[], + return_types=[params_types[-1]] if return_val else [], region=region, ) nodes_dict = {} diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index be687b444..14e5fb47f 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -7,9 +7,11 @@ from typing_extensions import override from typing import Any, Type, TypeAlias, cast -from xdsl.dialects import linalg, arith, builtin, memref +from xdsl.dialects import linalg, arith, builtin, memref, tensor from xdsl.dialects.builtin import ( MemRefType, + TensorType, + Sequence, f32, f64, i64, @@ -42,8 +44,9 @@ def __init__( args: tuple[Any, ...], attrs: dict[str, Any] = {}, name: str | None = None, + op_type: MemRefType | TensorType = MemRefType, ) -> None: - self.operator = operator(args, attrs, name=name) + self.operator = operator(args, attrs, name=name, op_type=op_type) self.args = args self.attrs = attrs self.name = self.operator.name if name is None else name @@ -78,7 +81,7 @@ def np_outputs_spec(self) -> list[dict[str, Any]]: return outputs_spec @classmethod - def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation": + def from_operation(cls, xtc_op: Operation, name: str | None, op_type: MemRefType | TensorType) -> "MlirOperation": dims = xtc_op.dims.values() dtype = xtc_op.inputs_types[0].dtype # TODO: currently get dtype from 1st arg args = tuple([*dims, dtype]) @@ -88,6 +91,7 @@ def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation": args, dict(attrs), name=name, + op_type=op_type, ) @@ -97,11 +101,12 @@ class MlirOperator(ABC): KINDS = "" def __init__( - self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None + self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None, op_type: MemRefType | TensorType = MemRefType ) -> None: self.args = args self.attrs = {**attrs} self.name = name if name is not None else self.DEFAULT_NAME + self.op_type = op_type @abstractmethod def generate_op( @@ -149,24 +154,42 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] + self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 3 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) - fill = linalg.FillOp( - res=(), - inputs=(cst0.results[0],), - outputs=(args[2],), - ) - reduce = linalg.MatmulOp( - res=(), - inputs=(args[0], args[1]), - outputs=(args[2],), - ) + + if self.op_type == MemRefType: + fill = linalg.FillOp( + res=(), + inputs=(cst0.results[0],), + outputs=(args[2],), + ) + reduce = linalg.MatmulOp( + res=(), + inputs=(args[0], args[1]), + outputs=(args[2],), + ) + else: + empty = tensor.EmptyOp( + dynamic_sizes=[], + tensor_type=args[2].type, + ) + fill = linalg.FillOp( + res=(empty.results[0].type,), + inputs=(cst0.results[0],), + outputs=(empty.results[0],), + ) + reduce = linalg.MatmulOp( + res=(args[2].type,), + inputs=(args[0], args[1]), + outputs=(fill.results[0],), + ) + fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() @@ -175,6 +198,7 @@ def generate_op( "nodes_map": { fill_node_id: fill, reduce_node_id: reduce, + "return_node_id": reduce if self.op_type == TensorType else None, }, "dims_sizes": [ {"i": Ki, "j": Kj}, diff --git a/src/xtc/utils/xdsl_aux.py b/src/xtc/utils/xdsl_aux.py index c339f02d3..0061ccba5 100644 --- a/src/xtc/utils/xdsl_aux.py +++ b/src/xtc/utils/xdsl_aux.py @@ -12,6 +12,7 @@ from xdsl.dialects.arith import ConstantOp from xdsl.dialects.builtin import ( MemRefType, + TensorType, IntegerAttr, FloatAttr, IntegerType, @@ -19,7 +20,7 @@ from xdsl.context import Context from xdsl.parser import Parser -from xdsl.dialects import func, linalg, arith, memref +from xdsl.dialects import func, linalg, arith, memref, tensor from xdsl.dialects.builtin import ModuleOp @@ -29,6 +30,7 @@ def parse_xdsl_module(source: str) -> ModuleOp: context.load_dialect(linalg.Linalg) context.load_dialect(arith.Arith) context.load_dialect(memref.MemRef) + context.load_dialect(tensor.Tensor) parser = Parser(context, source) module = parser.parse_module() return module @@ -39,7 +41,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp: operands = source_op.operands shaped_types, scalar_types = [], [] for o in operands: - if isa(o.type, MemRefType): + if isa(o.type, MemRefType) or isa(o.type, TensorType): shaped_types.append(o.type) else: scalar_types.append(o.type) @@ -49,7 +51,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp: concrete_operands = [] shaped_count, scalar_count = 0, 0 for o in operands: - if isa(o.type, MemRefType): + if isa(o.type, MemRefType) or isa(o.type, TensorType): concrete_operands.append(payload.args[shaped_count]) shaped_count += 1 else: diff --git a/tests/filecheck/backends/test_matmul_mlir_tensor.py b/tests/filecheck/backends/test_matmul_mlir_tensor.py new file mode 100644 index 000000000..29d0392a8 --- /dev/null +++ b/tests/filecheck/backends/test_matmul_mlir_tensor.py @@ -0,0 +1,31 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="matmul_mlir", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") From 2800d44a54b7a049a679000673f1f3eec9652cc1 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Fri, 6 Feb 2026 12:17:33 +0100 Subject: [PATCH 03/14] tensor-dialect: added bufferization pass, single matmul ops --- src/xtc/backends/mlir/MlirCompiler.py | 16 +++ src/xtc/backends/mlir/MlirCompilerPasses.py | 15 +++ src/xtc/backends/mlir/MlirConfig.py | 1 + src/xtc/backends/mlir/MlirGraphBackend.py | 62 ++++++--- src/xtc/backends/mlir/MlirNodeBackend.py | 8 +- src/xtc/backends/mlir/MlirOps.py | 37 ++++-- .../tensor_dialect/test_matmul_mlir_tensor.py | 120 ++++++++++++++++++ .../backends/test_matmul_mlir_tensor.py | 31 ----- 8 files changed, 226 insertions(+), 64 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py delete mode 100644 tests/filecheck/backends/test_matmul_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index b4c9bfe34..9d3e55354 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -20,6 +20,7 @@ from xtc.backends.mlir.MlirCompilerPasses import ( MlirProgramInsertTransformPass, MlirProgramApplyTransformPass, + MlirProgramApplyPasses, ) from xtc.backends.mlir.MlirTarget import ( @@ -149,6 +150,19 @@ def mlir_apply_transform_pass(self) -> None: if self._config.print_transformed_ir: self.dump_ir("IR Dump After transform") + def mlir_apply_tensor_lowering_pass(self) -> None: + apply_transform_pass = MlirProgramApplyPasses( + mlir_program=self._mlir_program, + ) + apply_transform_pass.run( + [ + "eliminate-empty-tensors", # causes ops to write directly to out buffer + "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}", + ] + ) + if self._config.print_bufferization_ir: + self.dump_ir("IR Dump After Lowering") + def _save_temp(self, fname: str, content: Any) -> None: if not self._config.save_temps: return @@ -190,6 +204,8 @@ def compile(self) -> None: save_temp(src_ir_dump_file, self._mlir_program.mlir_module) + self.mlir_apply_tensor_lowering_pass() + self.mlir_insert_transform_pass() save_temp(mlir_btrn_dump_file, self._mlir_program.mlir_module) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index de33ff28d..7a7e2da7e 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -534,3 +534,18 @@ def run(self) -> None: transform_op.erase() else: break + + +class MlirProgramApplyPasses: + def __init__( + self, + mlir_program: RawMlirProgram, + ) -> None: + self._mlir_program = mlir_program + + def run(self, pass_names: list[str]) -> None: + ctx = self._mlir_program.mlir_context + pm = PassManager(context=ctx) + for name in pass_names: + pm.add(name) # type: ignore # no attribute add + pm.run(self._mlir_program.mlir_module.operation) diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py index 2d0ab5128..653456815 100644 --- a/src/xtc/backends/mlir/MlirConfig.py +++ b/src/xtc/backends/mlir/MlirConfig.py @@ -22,6 +22,7 @@ class MlirConfig: print_assembly: bool = False visualize_jumps: bool = True print_lowered_ir: bool = False + print_bufferization_ir: bool = False debug: bool = False color: bool = False concluding_passes: list[str] = field(default_factory=list) diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index ab2cc09c5..3f88cc793 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -2,12 +2,12 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # -from typing import cast, Any +from typing import cast, Any, Type from typing_extensions import override from xdsl.dialects.func import FuncOp as xdslFuncOp -from xdsl.dialects import func, memref -from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64 +from xdsl.dialects import func, memref, bufferization +from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -30,7 +30,9 @@ def __init__( no_alias: bool = True, use_tensor_dialect: bool = False, ): - self.xdsl_type = TensorType if use_tensor_dialect else MemRefType + self.xdsl_type: Type[TensorType] | Type[MemRefType] = ( + TensorType if use_tensor_dialect else MemRefType + ) if isinstance(xdsl_func, XTCGraph): assert nodes is None graph = xdsl_func @@ -64,7 +66,11 @@ def _init_from_xdsl( def _xdsl_generate_node( self, node: XTCNode, block: Block, variables: dict[str, Any] ): - operation = MlirOperation.from_operation(node.operation, name=node.name, op_type=self.xdsl_type) + operation = MlirOperation.from_operation( + node.operation, + name=node.name, + op_type=self.xdsl_type, # type: ignore + ) names = [*node.inputs, *node.outputs] assert node.inputs_types is not None and node.outputs_types is not None types = [*node.inputs_types, *node.outputs_types] @@ -81,7 +87,8 @@ def _xdsl_generate_node( variables[name] = alloca.results[0] args = [variables[name] for name in names] _, attrs = operation.generate(block=block, args=args) - return attrs + last_node = attrs["nodes_map"].get("return_node_id") + return attrs, last_node def _init_from_graph( self, @@ -97,28 +104,43 @@ def _init_from_graph( ) params_types = [ self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type)) - for tensor_type in [*inputs_types, *outputs_types] + for tensor_type in inputs_types # [*inputs_types, *outputs_types] ] + # graph output types are always memrefs + params_types.extend( + self._memref_type_from_tensortype(cast(XTCTensorType, tensor_type)) + for tensor_type in outputs_types + ) inlined_block = Block(arg_types=params_types) variables = { name: arg for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args) } block_attrs = [] + last_node = None for node in graph.nodes.values(): - node_attrs = self._xdsl_generate_node(node, inlined_block, variables) + node_attrs, last_node = self._xdsl_generate_node( + node, inlined_block, variables + ) block_attrs.append(node_attrs) - return_val = block_attrs[-1]["nodes_map"]["return_node_id"] with ImplicitBuilder(inlined_block): - if return_val: - func.ReturnOp(return_val) - else: - func.ReturnOp() + if self.xdsl_type == TensorType: + assert last_node + reduce = bufferization.MaterializeInDestinationOp( + # operands=((inlined_block.args[-1],), (last_node.results[0],)), + operands=((last_node.results[0],), (inlined_block.args[-1],)), + # result_types=(last_node.results[0].type,), + # result_types=(inlined_block.args[-1].type,), + result_types=((),), + ) + reduce.attributes["writable"] = UnitAttr() + reduce.attributes["restrict"] = UnitAttr() + func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy payload = xdslFuncOp.from_region( name=graph.name, input_types=params_types, - return_types=[params_types[-1]] if return_val else [], + return_types=[], region=region, ) nodes_dict = {} @@ -134,7 +156,7 @@ def _init_from_graph( always_vectorize=always_vectorize, concluding_passes=concluding_passes, id=f"__xtc_id_{node_id}_", - xdsl_type=self.xdsl_type + xdsl_type=self.xdsl_type, ) return payload, nodes_dict @@ -146,8 +168,12 @@ def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any: elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) return self.xdsl_type(elt_type, shape) + def _memref_type_from_tensortype(self, type: XTCTensorType) -> Any: + elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) + return MemRefType(elt_type, shape) + def _np_types_spec( - self, types: list[MemRefType | TensorType] + self, types: list[MemRefType] | list[TensorType] ) -> list[dict[str, tuple[int, ...] | str]]: types_map = {"f32": "float32", "f64": "float64"} types_spec: list[dict[str, tuple[int, ...] | str]] = [ @@ -163,12 +189,12 @@ def _np_types_spec( def np_inputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]] - list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types) + list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types) # type: ignore return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]] - list_xdsl_tys = cast(list[self.xdsl_type], outputs_args_types) + list_xdsl_tys = cast(list[MemRefType], outputs_args_types) return self._np_types_spec(list_xdsl_tys) diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py index 8ae0c97e7..f809e392c 100644 --- a/src/xtc/backends/mlir/MlirNodeBackend.py +++ b/src/xtc/backends/mlir/MlirNodeBackend.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # -from typing import cast, Any +from typing import cast, Any, Type from typing_extensions import override from xdsl.ir import Operation as xdslOperation @@ -26,7 +26,7 @@ def __init__( always_vectorize: bool = False, no_alias: bool = True, id: str | None = None, - xdsl_type: MemRefType | TensorType = MemRefType, + xdsl_type: Type[TensorType] | Type[MemRefType] = MemRefType, ): self._graph = None self.xdsl_type = xdsl_type @@ -65,11 +65,11 @@ def _np_types_spec( @override def np_inputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.inputs] # type: ignore - list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) # type: ignore return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.outputs] # type: ignore - list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) # type: ignore return self._np_types_spec(list_xdsl_tys) diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index 14e5fb47f..3b4adada8 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -11,7 +11,6 @@ from xdsl.dialects.builtin import ( MemRefType, TensorType, - Sequence, f32, f64, i64, @@ -44,7 +43,7 @@ def __init__( args: tuple[Any, ...], attrs: dict[str, Any] = {}, name: str | None = None, - op_type: MemRefType | TensorType = MemRefType, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: self.operator = operator(args, attrs, name=name, op_type=op_type) self.args = args @@ -81,7 +80,12 @@ def np_outputs_spec(self) -> list[dict[str, Any]]: return outputs_spec @classmethod - def from_operation(cls, xtc_op: Operation, name: str | None, op_type: MemRefType | TensorType) -> "MlirOperation": + def from_operation( + cls, + xtc_op: Operation, + name: str | None, + op_type: Type[MemRefType] | Type[TensorType], + ) -> "MlirOperation": dims = xtc_op.dims.values() dtype = xtc_op.inputs_types[0].dtype # TODO: currently get dtype from 1st arg args = tuple([*dims, dtype]) @@ -101,7 +105,11 @@ class MlirOperator(ABC): KINDS = "" def __init__( - self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None, op_type: MemRefType | TensorType = MemRefType + self, + args: tuple[Any, ...], + attrs: dict[str, Any], + name: str | None = None, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: self.args = args self.attrs = {**attrs} @@ -154,12 +162,15 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] + self.op_type(elt_type, shape) + for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 3 - assert all(isinstance(arg.type, self.op_type) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args[:-1]) + # output arg is always a memref (for now) + assert isinstance(args[-1].type, MemRefType) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) @@ -175,9 +186,10 @@ def generate_op( outputs=(args[2],), ) else: + out_tensor_type = TensorType(elt_type, [Ki, Kj]) empty = tensor.EmptyOp( dynamic_sizes=[], - tensor_type=args[2].type, + tensor_type=out_tensor_type, ) fill = linalg.FillOp( res=(empty.results[0].type,), @@ -185,11 +197,10 @@ def generate_op( outputs=(empty.results[0],), ) reduce = linalg.MatmulOp( - res=(args[2].type,), + res=(fill.results[0].type,), inputs=(args[0], args[1]), outputs=(fill.results[0],), ) - fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() @@ -198,7 +209,7 @@ def generate_op( "nodes_map": { fill_node_id: fill, reduce_node_id: reduce, - "return_node_id": reduce if self.op_type == TensorType else None, + "return_node_id": reduce, }, "dims_sizes": [ {"i": Ki, "j": Kj}, @@ -247,7 +258,11 @@ class MlirOperatorConv2D(MlirOperator): DEFAULT_STRIDE = (1, 1) def __init__( - self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None + self, + args: tuple[Any, ...], + attrs: dict[str, Any], + name: str | None = None, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: attrs = {"stride": self.DEFAULT_STRIDE, **attrs} super().__init__(args, attrs, name) diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py new file mode 100644 index 000000000..d363536ee --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py @@ -0,0 +1,120 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="matmul_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_3 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_3 to %c32 step %c1_4 { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_4 = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_5 to %c32 step %c1_6 { +# CHECK-NEXT: %subview_7 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_4[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_10 to %c512 step %c1_11 { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 + diff --git a/tests/filecheck/backends/test_matmul_mlir_tensor.py b/tests/filecheck/backends/test_matmul_mlir_tensor.py deleted file mode 100644 index 29d0392a8..000000000 --- a/tests/filecheck/backends/test_matmul_mlir_tensor.py +++ /dev/null @@ -1,31 +0,0 @@ -# RUN: python %s 2>&1 | filecheck %s -# UNSUPPORTED: mlir-target=nvgpu - -import xtc.graphs.xtc.op as O -from xtc.backends.mlir import Backend - -I, J, K, dtype = 4, 32, 512, "float32" -a = O.tensor((I, K), dtype, name="A") -b = O.tensor((K, J), dtype, name="B") - -with O.graph(name="matmul") as gb: - O.matmul(a, b, name="C") - -graph = gb.graph -print(graph) - -impl = Backend(graph) - -sch = impl.get_scheduler() -sched = sch.schedule() - -comp = impl.get_compiler( - shared_lib=True, - dump_file="matmul_mlir", - print_source_ir=True, - print_transformed_ir=True, -) -module = comp.compile(sched) -executor = module.get_executor(validate=True) -res = executor.execute() -print(f"CODE: {res}") From 6dbe48eaab51f12500769defec838514ec594815 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Fri, 6 Feb 2026 14:27:22 +0100 Subject: [PATCH 04/14] tensor-dialect: support for multiple ops --- src/xtc/backends/mlir/MlirGraphBackend.py | 19 +- src/xtc/backends/mlir/MlirOps.py | 23 ++- .../test_two_matmuls_mlir_tensor.py | 181 ++++++++++++++++++ 3 files changed, 207 insertions(+), 16 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 3f88cc793..d9aa9ed6c 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -6,7 +6,7 @@ from typing_extensions import override from xdsl.dialects.func import FuncOp as xdslFuncOp -from xdsl.dialects import func, memref, bufferization +from xdsl.dialects import func, memref, tensor, bufferization from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -79,12 +79,19 @@ def _xdsl_generate_node( continue with ImplicitBuilder(block): elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) - alloca = memref.AllocaOp.get( - return_type=elt_type, - shape=shape, - alignment=256, # Take the default of dlpack lib + result_op = ( + tensor.EmptyOp( + dynamic_sizes=[], + tensor_type=self._xdsl_type_from_tensortype(type), + ) + if self.xdsl_type == TensorType + else memref.AllocaOp.get( + return_type=elt_type, + shape=shape, + alignment=256, # Take the default of dlpack lib + ) ) - variables[name] = alloca.results[0] + variables[name] = result_op.results[0] args = [variables[name] for name in names] _, attrs = operation.generate(block=block, args=args) last_node = attrs["nodes_map"].get("return_node_id") diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index 3b4adada8..088c4195e 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -162,15 +162,15 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - self.op_type(elt_type, shape) - for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] + self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]] ] + ops_types.append(MemRefType(elt_type, [Ki, Kj])) block = Block(arg_types=ops_types) args = block.args + has_tensor_result = isinstance(args[-1].type, TensorType) assert len(args) == 3 assert all(isinstance(arg.type, self.op_type) for arg in args[:-1]) - # output arg is always a memref (for now) - assert isinstance(args[-1].type, MemRefType) + assert not (has_tensor_result and self.op_type == MemRefType) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) @@ -186,15 +186,18 @@ def generate_op( outputs=(args[2],), ) else: - out_tensor_type = TensorType(elt_type, [Ki, Kj]) - empty = tensor.EmptyOp( - dynamic_sizes=[], - tensor_type=out_tensor_type, + empty = ( + args[2] + if has_tensor_result + else tensor.EmptyOp( + dynamic_sizes=[], + tensor_type=TensorType(elt_type, [Ki, Kj]), + ).results[0] ) fill = linalg.FillOp( - res=(empty.results[0].type,), + res=(empty.type,), inputs=(cst0.results[0],), - outputs=(empty.results[0],), + outputs=(empty,), ) reduce = linalg.MatmulOp( res=(fill.results[0].type,), diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py new file mode 100644 index 000000000..a7ea1b96a --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -0,0 +1,181 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") +c = O.tensor((J, I), dtype, name="C") + +with O.graph(name="matmul") as gb: + d = O.matmul(a, b, name="D") + O.matmul(c, d, name="E") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) +#impl = Backend(graph, use_tensor_dialect=False) + +sch = impl.get_scheduler(default_node = "E") +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="two_matmul_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_D_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_D_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_E_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_E_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./k" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0 to %c4 step %c1 { +# CHECK-NEXT: %subview = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 { +# CHECK-NEXT: %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 { +# CHECK-NEXT: %subview_14 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c512 step %c1_18 { +# CHECK-NEXT: %subview_19 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_20 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_19, %subview_20 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c32 step %c1_5 { +# CHECK-NEXT: %subview = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 { +# CHECK-NEXT: %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c32_7 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %alloc[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 { +# CHECK-NEXT: %subview_14 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, %arg5] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c4_18 = arith.constant 4 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c4_18 step %c1_19 { +# CHECK-NEXT: %subview_20 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_20, %subview_21 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_22 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: - %2 : 32x4xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %4 : 32x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %3: matmul(%0, %1) {name = 'D'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 + From 89dd2f54d374919ddcf9bd385aed9b8b2481eb8f Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Fri, 6 Feb 2026 17:16:06 +0100 Subject: [PATCH 05/14] tensor-dialect: cleanup, added lowering check to tests --- src/xtc/backends/mlir/MlirCompiler.py | 4 +- src/xtc/backends/mlir/MlirGraphBackend.py | 19 ++++---- src/xtc/backends/mlir/MlirOps.py | 48 ++++++------------- .../tensor_dialect/test_matmul_mlir_tensor.py | 26 +++++++++- .../test_two_matmuls_mlir_tensor.py | 34 ++++++++++++- 5 files changed, 87 insertions(+), 44 deletions(-) diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index 9d3e55354..babec3e33 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -154,6 +154,8 @@ def mlir_apply_tensor_lowering_pass(self) -> None: apply_transform_pass = MlirProgramApplyPasses( mlir_program=self._mlir_program, ) + if self._config.print_bufferization_ir: + self.dump_ir("IR Dump Before Tensor Lowering") apply_transform_pass.run( [ "eliminate-empty-tensors", # causes ops to write directly to out buffer @@ -161,7 +163,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None: ] ) if self._config.print_bufferization_ir: - self.dump_ir("IR Dump After Lowering") + self.dump_ir("IR Dump After Tensor Lowering") def _save_temp(self, fname: str, content: Any) -> None: if not self._config.save_temps: diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index d9aa9ed6c..e6236ce53 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -75,6 +75,12 @@ def _xdsl_generate_node( assert node.inputs_types is not None and node.outputs_types is not None types = [*node.inputs_types, *node.outputs_types] for name, type in zip(names, types): + if name in node.outputs and self.xdsl_type == TensorType: + with ImplicitBuilder(block): + variables[name] = tensor.EmptyOp( + dynamic_sizes=[], + tensor_type=self._xdsl_type_from_tensortype(type), + ).results[0] if name in variables: continue with ImplicitBuilder(block): @@ -82,7 +88,7 @@ def _xdsl_generate_node( result_op = ( tensor.EmptyOp( dynamic_sizes=[], - tensor_type=self._xdsl_type_from_tensortype(type), + tensor_type=TensorType(elt_type, shape), ) if self.xdsl_type == TensorType else memref.AllocaOp.get( @@ -111,7 +117,7 @@ def _init_from_graph( ) params_types = [ self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type)) - for tensor_type in inputs_types # [*inputs_types, *outputs_types] + for tensor_type in inputs_types ] # graph output types are always memrefs params_types.extend( @@ -133,15 +139,12 @@ def _init_from_graph( with ImplicitBuilder(inlined_block): if self.xdsl_type == TensorType: assert last_node - reduce = bufferization.MaterializeInDestinationOp( - # operands=((inlined_block.args[-1],), (last_node.results[0],)), + # write the final tensor value to the output buffer + dest = bufferization.MaterializeInDestinationOp( operands=((last_node.results[0],), (inlined_block.args[-1],)), - # result_types=(last_node.results[0].type,), - # result_types=(inlined_block.args[-1].type,), result_types=((),), + attributes={"writable": UnitAttr(), "restrict": UnitAttr()}, ) - reduce.attributes["writable"] = UnitAttr() - reduce.attributes["restrict"] = UnitAttr() func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy payload = xdslFuncOp.from_region( diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index 088c4195e..c645f0f41 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -7,7 +7,7 @@ from typing_extensions import override from typing import Any, Type, TypeAlias, cast -from xdsl.dialects import linalg, arith, builtin, memref, tensor +from xdsl.dialects import linalg, arith, builtin, memref from xdsl.dialects.builtin import ( MemRefType, TensorType, @@ -164,7 +164,7 @@ def generate_op( ops_types = [ self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]] ] - ops_types.append(MemRefType(elt_type, [Ki, Kj])) + ops_types.append(TensorType(elt_type, [Ki, Kj])) block = Block(arg_types=ops_types) args = block.args has_tensor_result = isinstance(args[-1].type, TensorType) @@ -173,37 +173,19 @@ def generate_op( assert not (has_tensor_result and self.op_type == MemRefType) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) - - if self.op_type == MemRefType: - fill = linalg.FillOp( - res=(), - inputs=(cst0.results[0],), - outputs=(args[2],), - ) - reduce = linalg.MatmulOp( - res=(), - inputs=(args[0], args[1]), - outputs=(args[2],), - ) - else: - empty = ( - args[2] - if has_tensor_result - else tensor.EmptyOp( - dynamic_sizes=[], - tensor_type=TensorType(elt_type, [Ki, Kj]), - ).results[0] - ) - fill = linalg.FillOp( - res=(empty.type,), - inputs=(cst0.results[0],), - outputs=(empty,), - ) - reduce = linalg.MatmulOp( - res=(fill.results[0].type,), - inputs=(args[0], args[1]), - outputs=(fill.results[0],), - ) + result = (args[2].type,) if self.op_type == TensorType else () + fill = linalg.FillOp( + res=result, + inputs=(cst0.results[0],), + outputs=(args[2],), + ) + reduce = linalg.MatmulOp( + res=result, + inputs=(args[0], args[1]), + outputs=(fill.results[0],) + if self.op_type == TensorType + else (args[2],), + ) fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py index d363536ee..5a7ed668e 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py @@ -24,12 +24,36 @@ dump_file="matmul_mlir_tensor", print_source_ir=True, print_transformed_ir=True, + print_bufferization_ir=True, ) module = comp.compile(sched) executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before transform //----- // +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py index a7ea1b96a..7846515e9 100644 --- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -27,12 +27,44 @@ dump_file="two_matmul_mlir_tensor", print_source_ir=True, print_transformed_ir=True, + print_bufferization_ir=True, ) module = comp.compile(sched) executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before transform //----- // +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %0 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { # CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> From f5a8c01d1587afd0ef03a93a16a14e66aebf2721 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Mon, 9 Feb 2026 15:58:29 +0100 Subject: [PATCH 06/14] tensor-dialect: added relu with collapsing shapes --- src/xtc/backends/mlir/MlirCompiler.py | 1 + src/xtc/backends/mlir/MlirOps.py | 57 +++-- .../test_matmul_relu_mlir_tensor.py | 204 ++++++++++++++++++ .../test_two_matmuls_mlir_tensor.py | 24 +-- 4 files changed, 260 insertions(+), 26 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index babec3e33..85d7d543b 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -160,6 +160,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None: [ "eliminate-empty-tensors", # causes ops to write directly to out buffer "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}", + "func.func(promote-buffers-to-stack)", ] ) if self._config.print_bufferization_ir: diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index c645f0f41..e919d713b 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -7,7 +7,7 @@ from typing_extensions import override from typing import Any, Type, TypeAlias, cast -from xdsl.dialects import linalg, arith, builtin, memref +from xdsl.dialects import linalg, arith, builtin, memref, tensor from xdsl.dialects.builtin import ( MemRefType, TensorType, @@ -391,13 +391,14 @@ def generate_op( elt_type = {"float32": f32, "float64": f64}[dtype] elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: - ops_types = [MemRefType(elt_type, shape) for shape in [[Ki], [Ki]]] + ops_types = [self.op_type(elt_type, shape) for shape in [[Ki], [Ki]]] block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) inp_shape, out_shape = [ - list(cast(MemRefType, arg.type).get_shape()) for arg in args + list(cast(self.op_type, arg.type).get_shape()) # type: ignore + for arg in args ] inp_size, out_size = [mulall(shape) for shape in [inp_shape, out_shape]] assert inp_size == out_size @@ -416,15 +417,32 @@ def generate_op( ) ] ) - inp = memref.CollapseShapeOp( - operands=[args[0]], - properties=dict(reassociation=inp_reassociation), - result_types=[MemRefType(elt_type, (inp_size,))], - ) - out = memref.CollapseShapeOp( - operands=[args[1]], - properties=dict(reassociation=out_reassociation), - result_types=[MemRefType(elt_type, (out_size,))], + if self.op_type == TensorType: + inp = tensor.CollapseShapeOp( # type: ignore + operands=[args[0]], + properties=dict(reassociation=inp_reassociation), + result_types=[self.op_type(elt_type, (inp_size,))], + ) + # create empty tensor for collapsed output shape + out_empty = tensor.EmptyOp([], TensorType(elt_type, [out_size])) + out_operand = out_empty.tensor + else: + inp = memref.CollapseShapeOp( # type: ignore + operands=[args[0]], + properties=dict(reassociation=inp_reassociation), + result_types=[self.op_type(elt_type, (inp_size,))], + ) + out = memref.CollapseShapeOp( + operands=[args[1]], + properties=dict(reassociation=out_reassociation), + result_types=[self.op_type(elt_type, (out_size,))], + ) + out_operand = out.results[0] + + result = ( + (TensorType(elt_type, [out_size]),) + if self.op_type == TensorType + else () ) cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) iterator_types = [ @@ -436,7 +454,7 @@ def generate_op( linalg.YieldOp(max) relu = linalg.GenericOp( inputs=(inp.results[0], cst0.results[0]), - outputs=(out.results[0],), + outputs=(out_operand,), body=Region([block_in]), # type: ignore # mypy issue with dataclass # ignore typing due to xdsl hints limitation indexing_maps=[ @@ -460,12 +478,23 @@ def generate_op( ), ], iterator_types=iterator_types, + result_types=result, ) + relu_result = None + if self.op_type == TensorType: + relu_result = tensor.ExpandShapeOp( + relu.results[0], + reassociation=out_reassociation, + result_type=TensorType(elt_type, out_shape), + static_output_shape=out_shape, + dynamic_output_shape=[], + ) relu_node_id = f"{self.name}" relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr() attrs = { "nodes_map": { relu_node_id: relu, + "return_node_id": relu_result, }, "dims_sizes": [ self.dims_sizes(), diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py new file mode 100644 index 000000000..76379eb5c --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py @@ -0,0 +1,204 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul_relu") as gb: + m = O.matmul(a, b, name="matmul") + O.relu(m, name="relu") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler(default_node="matmul") +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="matmul_relu_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %6 = arith.maximumf %in, %in_1 : f32 +# CHECK-NEXT: linalg.yield %6 : f32 +# CHECK-NEXT: } -> tensor<128xf32> +# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): +# CHECK-NEXT: %0 = arith.maximumf %in, %in_2 : f32 +# CHECK-NEXT: linalg.yield %0 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> +# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): +# CHECK-NEXT: %0 = arith.maximumf %in, %in_2 : f32 +# CHECK-NEXT: linalg.yield %0 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> +# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_matmul_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# CHECK-NEXT: %subview = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_7 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_7 to %c32 step %c1_8 { +# CHECK-NEXT: %subview_9 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_8 = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c32 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_14 to %c512 step %c1_15 { +# CHECK-NEXT: %subview_16 = memref.subview %subview_11[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_17 = memref.subview %subview_12[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_18 = memref.subview %subview_13[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%subview_16, %subview_17 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_18 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c128 = arith.constant 128 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_5 to %c128 step %c1_6 { +# CHECK-NEXT: %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %alloca_3[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_4 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_7 : memref<1xf32, strided<[1], offset: ?>>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): +# CHECK-NEXT: %0 = arith.maximumf %in, %in_8 : f32 +# CHECK-NEXT: linalg.yield %0 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> +# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul_relu +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %3 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'matmul'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 + diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py index 7846515e9..efb890a66 100644 --- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -52,13 +52,13 @@ # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // # CHECK-NEXT: module { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) # CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } @@ -67,13 +67,13 @@ # CHECK-NEXT: // -----// IR Dump Before transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>) +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) # CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } @@ -113,13 +113,13 @@ # CHECK-NEXT: // -----// IR Dump After transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: scf.for %arg4 = %c0 to %c4 step %c1 { -# CHECK-NEXT: %subview = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %c0_9 = arith.constant 0 : index # CHECK-NEXT: %c32_10 = arith.constant 32 : index # CHECK-NEXT: %c1_11 = arith.constant 1 : index @@ -134,7 +134,7 @@ # CHECK-NEXT: scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 { # CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> # CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c32_12 = arith.constant 32 : index # CHECK-NEXT: %c1_13 = arith.constant 1 : index @@ -172,7 +172,7 @@ # CHECK-NEXT: %c1_8 = arith.constant 1 : index # CHECK-NEXT: scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 { # CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %alloc[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_9 = memref.subview %alloca[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> # CHECK-NEXT: %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c32_12 = arith.constant 32 : index From cbb5303bee8044b19c2bd4e2ba2595ea5323ad73 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Tue, 10 Feb 2026 11:44:42 +0100 Subject: [PATCH 07/14] tensor-dialect: added conv2d --- src/xtc/backends/mlir/MlirOps.py | 23 +- .../test_conv2d_mini_mlir_tensor.py | 233 ++++++++++++++++++ 2 files changed, 246 insertions(+), 10 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index e919d713b..e309149fe 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -162,15 +162,13 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]] + self.op_type(elt_type, shape) + for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] ] - ops_types.append(TensorType(elt_type, [Ki, Kj])) block = Block(arg_types=ops_types) args = block.args - has_tensor_result = isinstance(args[-1].type, TensorType) assert len(args) == 3 - assert all(isinstance(arg.type, self.op_type) for arg in args[:-1]) - assert not (has_tensor_result and self.op_type == MemRefType) + assert all(isinstance(arg.type, self.op_type) for arg in args) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) result = (args[2].type,) if self.op_type == TensorType else () @@ -250,7 +248,7 @@ def __init__( op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: attrs = {"stride": self.DEFAULT_STRIDE, **attrs} - super().__init__(args, attrs, name) + super().__init__(args, attrs, name, op_type) @override def dims(self, kind: str = "") -> tuple[str, ...]: @@ -274,16 +272,17 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) for shape in [*inps_dims, out_dims] + self.op_type(elt_type, shape) for shape in [*inps_dims, out_dims] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 3 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) with ImplicitBuilder(block): + result = (args[2].type,) if self.op_type == TensorType else () cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) fill = linalg.FillOp( - res=(), + res=result, inputs=(cst0.results[0],), outputs=(args[2],), ) @@ -306,7 +305,9 @@ def generate_op( linalg.YieldOp(add) reduce = linalg.GenericOp( inputs=(args[0], args[1]), - outputs=(args[2],), + outputs=(fill.results[0],) + if self.op_type == TensorType + else (args[2],), body=Region([block_in]), # type: ignore # mypy issue with dataclass # ignore typing due to xdsl hints limitation indexing_maps=[ @@ -330,6 +331,7 @@ def generate_op( ), ], iterator_types=iterator_types, + result_types=result, ) fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" @@ -339,6 +341,7 @@ def generate_op( "nodes_map": { fill_node_id: fill, reduce_node_id: reduce, + "return_node_id": reduce, }, "dims_sizes": [ {"b": Kb, "h": Kh, "w": Kw, "f": Kf}, diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py new file mode 100644 index 000000000..89e6a0e18 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py @@ -0,0 +1,233 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +# Small conv2d +N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 3, 3, 3, 1, 1, "float32" +a = O.tensor((N, H + R - 1, W + S - 1, C), dtype, name="I") +b = O.tensor((R, S, C, F), dtype, name="W") + +with O.graph(name="conv2d_nhwc_mini") as gb: + O.conv2d(a, b, stride=(SH, SW), name="O") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="conv2d_nhwc_mini_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x8x8x16xf32>) -> tensor<1x8x8x16xf32> +# CHECK-NEXT: %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x10x10x3xf32>, tensor<3x3x3x16xf32>) outs(%1 : tensor<1x8x8x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %3 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %4 = arith.addf %out, %3 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 +# CHECK-NEXT: } -> tensor<1x8x8x16xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>) +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %0 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 +# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>) +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %0 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 +# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./f" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./c" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c1 step %c1_0 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { +# CHECK-NEXT: %subview_6 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_7 = arith.constant 0 : index +# CHECK-NEXT: %c8_8 = arith.constant 8 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_7 to %c8_8 step %c1_9 { +# CHECK-NEXT: %subview_10 = memref.subview %subview_6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_11 to %c16 step %c1_12 { +# CHECK-NEXT: %subview_13 = memref.subview %subview_10[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_5 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_6 to %c8 step %c1_7 { +# CHECK-NEXT: %subview_8 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c8_12 = arith.constant 8 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 { +# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c16 step %c1_18 { +# CHECK-NEXT: %subview_19 = memref.subview %subview_14[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_20 = memref.subview %subview_15[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_16[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg7 = %c0_22 to %c3 step %c1_23 { +# CHECK-NEXT: %subview_24 = memref.subview %subview_19[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_25 = memref.subview %subview_20[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_26 = memref.subview %subview_21[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_27 = arith.constant 0 : index +# CHECK-NEXT: %c3_28 = arith.constant 3 : index +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg8 = %c0_27 to %c3_28 step %c1_29 { +# CHECK-NEXT: %subview_30 = memref.subview %subview_24[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_31 = memref.subview %subview_25[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_32 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_33 = arith.constant 0 : index +# CHECK-NEXT: %c3_34 = arith.constant 3 : index +# CHECK-NEXT: %c1_35 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg9 = %c0_33 to %c3_34 step %c1_35 { +# CHECK-NEXT: %subview_36 = memref.subview %subview_30[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_37 = memref.subview %subview_31[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_38 = memref.subview %subview_32[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_36, %subview_37 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_39: f32, %out: f32): +# CHECK-NEXT: %0 = arith.mulf %in, %in_39 : f32 +# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 +# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: conv2d_nhwc_mini +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x10x10x3xfloat32 +# CHECK-NEXT: - %1 : 3x3x3x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 1x8x8x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 + From 3759374b80c84615942836aa56a5b0d03449deff Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Tue, 10 Feb 2026 16:25:18 +0100 Subject: [PATCH 08/14] tensor-dialect: pad, unpad, memory alignment change --- src/xtc/backends/mlir/MlirCompiler.py | 2 +- src/xtc/backends/mlir/MlirOps.py | 76 +++-- .../test_matmul_relu_mlir_tensor.py | 13 +- .../test_pad_conv2d_mlir_tensor.py | 298 ++++++++++++++++++ .../test_pad_matmul_unpad_mlir_tensor.py | 242 ++++++++++++++ .../test_two_matmuls_mlir_tensor.py | 8 +- 6 files changed, 598 insertions(+), 41 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py create mode 100644 tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index 85d7d543b..7535f31a5 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -159,7 +159,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None: apply_transform_pass.run( [ "eliminate-empty-tensors", # causes ops to write directly to out buffer - "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}", + "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map buffer-alignment=256}", "func.func(promote-buffers-to-stack)", ] ) diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index e309149fe..1ad47bc91 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -566,7 +566,7 @@ def generate_op( block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) if isinstance(padding, dict): offsets = [0 for _ in self.args[:-1]] for i, (pad_b, pad_a) in padding.items(): @@ -577,23 +577,33 @@ def generate_op( strides = [1 for _ in self.args[:-1]] with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size)) + result = (args[1].type,) if self.op_type == TensorType else () fill = linalg.FillOp( - res=(), + res=result, inputs=(cst0.results[0],), outputs=(args[1],), ) - subview = memref.SubviewOp.from_static_parameters( - source=args[1], - source_type=args[1].type, # type: ignore - offsets=offsets, - sizes=sizes, - strides=strides, - ) - copy = linalg.CopyOp( - inputs=[args[0]], - outputs=[subview.result], - res=(), - ) + if self.op_type == TensorType: + copy = tensor.InsertSliceOp.from_static_parameters( + source=args[0], + dest=fill.results[0], + offsets=offsets, + sizes=sizes, + strides=strides, + ) + else: + subview = memref.SubviewOp.from_static_parameters( + source=args[1], + source_type=args[1].type, # type: ignore + offsets=offsets, + sizes=sizes, + strides=strides, + ) + copy = linalg.CopyOp( # type: ignore + inputs=[args[0]], + outputs=[subview.result], + res=result, + ) fill_node_id = f"{self.name}_0" fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() copy_node_id = f"{self.name}" @@ -602,6 +612,7 @@ def generate_op( "nodes_map": { fill_node_id: fill, copy_node_id: copy, + "return_node_id": copy, }, "dims_sizes": [ self.dims_sizes(), @@ -674,13 +685,13 @@ def generate_op( elt_type = {"float32": f32, "float64": f64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) + self.op_type(elt_type, shape) for shape in [dims_values_before_unpad, dims_values] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) if isinstance(padding, dict): offsets = [0 for _ in self.args[:-1]] for i, (pad_b, _) in padding.items(): @@ -690,23 +701,32 @@ def generate_op( sizes = dims_values strides = [1 for _ in self.args[:-1]] with ImplicitBuilder(block): - subview = memref.SubviewOp.from_static_parameters( - source=args[0], - source_type=args[0].type, # type: ignore - offsets=offsets, - sizes=sizes, - strides=strides, - ) - copy = linalg.CopyOp( - inputs=[subview.result], - outputs=[args[1]], - res=(), - ) + if self.op_type == TensorType: + copy = tensor.ExtractSliceOp.from_static_parameters( + source=args[0], + offsets=offsets, + sizes=sizes, + strides=strides, + ) + else: + subview = memref.SubviewOp.from_static_parameters( + source=args[0], + source_type=args[0].type, # type: ignore + offsets=offsets, + sizes=sizes, + strides=strides, + ) + copy = linalg.CopyOp( # type: ignore + inputs=[subview.result], + outputs=[args[1]], + res=(), + ) copy_node_id = f"{self.name}" copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr() attrs = { "nodes_map": { copy_node_id: copy, + "return_node_id": copy, }, "dims_sizes": [ self.dims_sizes(), diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py index 76379eb5c..037bc5f53 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py @@ -60,12 +60,12 @@ # CHECK-NEXT: #map1 = affine_map<(d0) -> ()> # CHECK-NEXT: module { # CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { # CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): @@ -83,12 +83,12 @@ # CHECK-NEXT: #map1 = affine_map<(d0) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { # CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): @@ -128,7 +128,7 @@ # CHECK-NEXT: #map1 = affine_map<(d0) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index @@ -169,7 +169,7 @@ # CHECK-NEXT: } {"./j"} # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 64 : i64} : memref<128xf32> +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> # CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_5 = arith.constant 0 : index # CHECK-NEXT: %c128 = arith.constant 128 : index @@ -201,4 +201,3 @@ # CHECK-NEXT: - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32] # CHECK-NEXT: # CHECK-NEXT: CODE: 0 - diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py new file mode 100644 index 000000000..68c2c7617 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py @@ -0,0 +1,298 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +# Small conv2d +N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 5, 5, 3, 2, 2, "float32" +a = O.tensor((N, H, W, C), dtype, name="I") +b = O.tensor((R, S, C, F), dtype, name="W") + +with O.graph(name="pad_conv2d_nhwc_mini") as gb: + p = O.pad2d(a, padding=2, axes=(1, 2), name="pad") + O.conv2d(p, b, stride=(SH, SW), name="conv") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="pad_conv2d_nhwc_mini_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%0 : tensor<1x12x12x3xf32>) -> tensor<1x12x12x3xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> +# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%0, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %5 = arith.mulf %in, %in_1 : f32 +# CHECK-NEXT: %6 = arith.addf %out, %5 : f32 +# CHECK-NEXT: linalg.yield %6 : f32 +# CHECK-NEXT: } -> tensor<1x4x4x16xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>) +# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>) +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %0 = arith.mulf %in, %in_1 : f32 +# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 +# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>) +# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>) +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %0 = arith.mulf %in, %in_1 : f32 +# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 +# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./c" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./c" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_21 "./f" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_23 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_25 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_27 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_29 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_31 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_33 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_35 "./c" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c1 step %c1_0 { +# CHECK-NEXT: %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c12 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c12_13 = arith.constant 12 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_12 to %c12_13 step %c1_14 { +# CHECK-NEXT: %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_16 to %c3 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 { +# CHECK-NEXT: %subview_8 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c4 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c4_13 = arith.constant 4 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_12 to %c4_13 step %c1_14 { +# CHECK-NEXT: %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_16 to %c16 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 { +# CHECK-NEXT: %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_11 to %c4 step %c1_12 { +# CHECK-NEXT: %0 = affine.apply #map(%arg4) +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_10[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c4_17 = arith.constant 4 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_16 to %c4_17 step %c1_18 { +# CHECK-NEXT: %1 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_19 = memref.subview %subview_13[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_20 = memref.subview %subview_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_22 to %c16 step %c1_23 { +# CHECK-NEXT: %subview_24 = memref.subview %subview_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_25 = memref.subview %subview_20[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_26 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_27 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c1_28 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg7 = %c0_27 to %c5 step %c1_28 { +# CHECK-NEXT: %subview_29 = memref.subview %subview_24[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %subview_25[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_31 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_32 = arith.constant 0 : index +# CHECK-NEXT: %c5_33 = arith.constant 5 : index +# CHECK-NEXT: %c1_34 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg8 = %c0_32 to %c5_33 step %c1_34 { +# CHECK-NEXT: %subview_35 = memref.subview %subview_29[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_36 = memref.subview %subview_30[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_37 = memref.subview %subview_31[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_38 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_39 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg9 = %c0_38 to %c3 step %c1_39 { +# CHECK-NEXT: %subview_40 = memref.subview %subview_35[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_41 = memref.subview %subview_36[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_42 = memref.subview %subview_37[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_41 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_42 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_43: f32, %out: f32): +# CHECK-NEXT: %2 = arith.mulf %in, %in_43 : f32 +# CHECK-NEXT: %3 = arith.addf %out, %2 : f32 +# CHECK-NEXT: linalg.yield %3 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: pad_conv2d_nhwc_mini +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x8x8x3xfloat32 +# CHECK-NEXT: - %1 : 5x5x3x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %3 : 1x4x4x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: pad2d(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32] +# CHECK-NEXT: - %3: conv2d(%2, %1, stride=(2, 2)) {name = 'conv'} : [1x12x12x3xfloat32, 5x5x3x16xfloat32] -> [1x4x4x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py new file mode 100644 index 000000000..34c09d96e --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py @@ -0,0 +1,242 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 14, 14, 14, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="pad_matmul_unpad") as gb: + p1 = O.pad(a, padding=(0, 2), name="A_pad") + p2 = O.pad(b, padding=(0, 2), name="B_pad") + m_pad = O.matmul(p1, p2, name="matmul_padded") + O.unpad(m_pad, padding=(0, 2), name="C") +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) +sch = impl.get_scheduler(default_node="matmul_padded") +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="gen_pad_tuple_matmul_unpad_mlir", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%0 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %3 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%0, %2 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>) +# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>) +# CHECK-NEXT: %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>) +# CHECK-NEXT: %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>) +# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>) +# CHECK-NEXT: %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>) +# CHECK-NEXT: %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./j" : !transform.any_op +# CHECK-NEXT: %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./j" : !transform.any_op +# CHECK-NEXT: %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_21 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_23 "./k" : !transform.any_op +# CHECK-NEXT: %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_25 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_27 "./j" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { +# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c16_3 = arith.constant 16 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 { +# CHECK-NEXT: %subview_15 = memref.subview %alloca_0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { +# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %subview_5 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %cst_7 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c16_9 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 { +# CHECK-NEXT: %subview_15 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { +# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c16_12 = arith.constant 16 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 { +# CHECK-NEXT: %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_17 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c16_19 = arith.constant 16 : index +# CHECK-NEXT: %c1_20 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_18 to %c16_19 step %c1_20 { +# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_17[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c16_25 = arith.constant 16 : index +# CHECK-NEXT: %c1_26 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_24 to %c16_25 step %c1_26 { +# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_22[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_23[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %subview_14 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: pad_matmul_unpad +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 14x14xfloat32 +# CHECK-NEXT: - %1 : 14x14xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %5 : 14x14xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: pad(%0, padding=(0, 2), constant_value=0) {name = 'A_pad'} : [14x14xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %3: pad(%1, padding=(0, 2), constant_value=0) {name = 'B_pad'} : [14x14xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %4: matmul(%2, %3) {name = 'matmul_padded'} : [16x16xfloat32, 16x16xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %5: unpad(%4, padding=(0, 2)) {name = 'C'} : [16x16xfloat32] -> [14x14xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py index efb890a66..c748923b8 100644 --- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -17,7 +17,6 @@ print(graph) impl = Backend(graph, use_tensor_dialect=True) -#impl = Backend(graph, use_tensor_dialect=False) sch = impl.get_scheduler(default_node = "E") sched = sch.schedule() @@ -52,7 +51,7 @@ # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // # CHECK-NEXT: module { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) @@ -67,7 +66,7 @@ # CHECK-NEXT: // -----// IR Dump Before transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) # CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) @@ -113,7 +112,7 @@ # CHECK-NEXT: // -----// IR Dump After transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index @@ -210,4 +209,3 @@ # CHECK-NEXT: - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32] # CHECK-NEXT: # CHECK-NEXT: CODE: 0 - From 58ffe404a0591d0af452ee27bf3b651d68d81695 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Wed, 11 Feb 2026 13:50:17 +0100 Subject: [PATCH 09/14] tensor-dialect: changed to pass mac tests, moved bufferize pass --- src/xtc/backends/mlir/MlirCompiler.py | 15 ++++----------- src/xtc/backends/mlir/MlirCompilerPasses.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index 7535f31a5..6f3538e68 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -20,7 +20,7 @@ from xtc.backends.mlir.MlirCompilerPasses import ( MlirProgramInsertTransformPass, MlirProgramApplyTransformPass, - MlirProgramApplyPasses, + apply_bufferization_passes, ) from xtc.backends.mlir.MlirTarget import ( @@ -151,18 +151,11 @@ def mlir_apply_transform_pass(self) -> None: self.dump_ir("IR Dump After transform") def mlir_apply_tensor_lowering_pass(self) -> None: - apply_transform_pass = MlirProgramApplyPasses( - mlir_program=self._mlir_program, - ) if self._config.print_bufferization_ir: self.dump_ir("IR Dump Before Tensor Lowering") - apply_transform_pass.run( - [ - "eliminate-empty-tensors", # causes ops to write directly to out buffer - "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map buffer-alignment=256}", - "func.func(promote-buffers-to-stack)", - ] - ) + + apply_bufferization_passes(self._mlir_program) + if self._config.print_bufferization_ir: self.dump_ir("IR Dump After Tensor Lowering") diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index 7a7e2da7e..dc228c157 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -26,6 +26,7 @@ OpResult, ) from mlir.passmanager import PassManager +import platform # Import SDist if available try: @@ -549,3 +550,22 @@ def run(self, pass_names: list[str]) -> None: for name in pass_names: pm.add(name) # type: ignore # no attribute add pm.run(self._mlir_program.mlir_module.operation) + + +def apply_bufferization_passes(mlir_program: RawMlirProgram): + apply_passes = MlirProgramApplyPasses(mlir_program) + bufferize_options = [ + "bufferize-function-boundaries=1", + "function-boundary-type-conversion=identity-layout-map", + "buffer-alignment=256", + ] + # needed for now because macos mlir version needs to be updated + if platform.system() != "Darwin": + bufferize_options.append("buffer-alignment=256") + apply_passes.run( + [ + "eliminate-empty-tensors", # causes ops to write directly to out buffer + f"one-shot-bufferize{{{' '.join(bufferize_options)}}}", + "func.func(promote-buffers-to-stack)", + ] + ) From e2c88ce14e7abbb0dc01a18ac06ef07993832db5 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Fri, 13 Feb 2026 11:59:41 +0100 Subject: [PATCH 10/14] tensor-dialect: moved bufferization to after transform --- src/xtc/backends/mlir/MlirCompiler.py | 4 +- src/xtc/backends/mlir/MlirCompilerPasses.py | 11 +- src/xtc/backends/mlir/MlirGraphBackend.py | 24 +- src/xtc/backends/mlir/MlirOps.py | 18 +- .../test_conv2d_mini_mlir_tensor.py | 433 ++++++++--- .../test_conv2d_r181_mlir_tensor.py | 723 ++++++++++++++++++ .../tensor_dialect/test_matmul_mlir_tensor.py | 197 +++-- .../test_matmul_relu_mlir_tensor.py | 398 +++++++--- .../test_pad_conv2d_mlir_tensor.py | 609 +++++++++++---- .../test_pad_matmul_unpad_mlir_tensor.py | 379 ++++++--- .../test_two_matmuls_mlir_tensor.py | 361 +++++++-- 11 files changed, 2560 insertions(+), 597 deletions(-) create mode 100644 tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index 6f3538e68..8a761f87d 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -200,12 +200,12 @@ def compile(self) -> None: save_temp(src_ir_dump_file, self._mlir_program.mlir_module) - self.mlir_apply_tensor_lowering_pass() - self.mlir_insert_transform_pass() save_temp(mlir_btrn_dump_file, self._mlir_program.mlir_module) self.mlir_apply_transform_pass() save_temp(mlir_atrn_dump_file, self._mlir_program.mlir_module) + self.mlir_apply_tensor_lowering_pass() + self._target.generate_code_for_target(self._mlir_program, dump_file=dump_file) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index dc228c157..a52e6aa21 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -557,15 +557,22 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram): bufferize_options = [ "bufferize-function-boundaries=1", "function-boundary-type-conversion=identity-layout-map", - "buffer-alignment=256", ] - # needed for now because macos mlir version needs to be updated + # TODO: below is needed until macos mlir is updated if platform.system() != "Darwin": bufferize_options.append("buffer-alignment=256") apply_passes.run( [ "eliminate-empty-tensors", # causes ops to write directly to out buffer f"one-shot-bufferize{{{' '.join(bufferize_options)}}}", + "func.func(buffer-hoisting)", + "func.func(buffer-loop-hoisting)", + "drop-equivalent-buffer-results", "func.func(promote-buffers-to-stack)", ] ) + + +def pre_transform_tensor_passes(mlir_program: RawMlirProgram): + apply_passes = MlirProgramApplyPasses(mlir_program) + # apply_passes.run(["eliminate-empty-tensors"]) diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index e6236ce53..0c05ebd16 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -83,24 +83,23 @@ def _xdsl_generate_node( ).results[0] if name in variables: continue + assert self.xdsl_type != TensorType with ImplicitBuilder(block): elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) - result_op = ( - tensor.EmptyOp( - dynamic_sizes=[], - tensor_type=TensorType(elt_type, shape), - ) - if self.xdsl_type == TensorType - else memref.AllocaOp.get( - return_type=elt_type, - shape=shape, - alignment=256, # Take the default of dlpack lib - ) + alloca = memref.AllocaOp.get( + return_type=elt_type, + shape=shape, + alignment=256, # Take the default of dlpack lib ) - variables[name] = result_op.results[0] + variables[name] = alloca.results[0] args = [variables[name] for name in names] _, attrs = operation.generate(block=block, args=args) last_node = attrs["nodes_map"].get("return_node_id") + # the tensor dialect needs the result of the op, not the alloca + if self.xdsl_type == TensorType: + # for name in node.outputs: + assert len(node.outputs) == 1 + variables[node.outputs[0]] = last_node.results[0] return attrs, last_node def _init_from_graph( @@ -131,6 +130,7 @@ def _init_from_graph( } block_attrs = [] last_node = None + for node in graph.nodes.values(): node_attrs, last_node = self._xdsl_generate_node( node, inlined_block, variables diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index 1ad47bc91..df8a4aba0 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -575,15 +575,16 @@ def generate_op( offsets = [padding[0] for _ in self.args[:-1]] sizes = list(dims_value_before_pad) strides = [1 for _ in self.args[:-1]] + using_tensors = self.op_type == TensorType with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size)) - result = (args[1].type,) if self.op_type == TensorType else () + result = (args[1].type,) if using_tensors else () fill = linalg.FillOp( res=result, inputs=(cst0.results[0],), outputs=(args[1],), ) - if self.op_type == TensorType: + if using_tensors: copy = tensor.InsertSliceOp.from_static_parameters( source=args[0], dest=fill.results[0], @@ -611,12 +612,12 @@ def generate_op( attrs = { "nodes_map": { fill_node_id: fill, - copy_node_id: copy, + copy_node_id: None if using_tensors else copy, "return_node_id": copy, }, "dims_sizes": [ self.dims_sizes(), - self.dims_sizes(), + *([] if using_tensors else [self.dims_sizes()]), ], } return block, attrs @@ -700,8 +701,9 @@ def generate_op( offsets = [padding[0] for _ in self.args[:-1]] sizes = dims_values strides = [1 for _ in self.args[:-1]] + using_tensors = self.op_type == TensorType with ImplicitBuilder(block): - if self.op_type == TensorType: + if using_tensors: copy = tensor.ExtractSliceOp.from_static_parameters( source=args[0], offsets=offsets, @@ -725,12 +727,10 @@ def generate_op( copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr() attrs = { "nodes_map": { - copy_node_id: copy, + copy_node_id: None if using_tensors else copy, "return_node_id": copy, }, - "dims_sizes": [ - self.dims_sizes(), - ], + "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])], } return block, attrs diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py index 89e6a0e18..bd8db60bd 100644 --- a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py @@ -31,11 +31,12 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // + +# CHECK: // -----// IR Dump Before transform //----- // # CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module { +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 @@ -49,44 +50,6 @@ # CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>) -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): -# CHECK-NEXT: %0 = arith.mulf %in, %in_0 : f32 -# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 -# CHECK-NEXT: linalg.yield %1 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>) -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): -# CHECK-NEXT: %0 = arith.mulf %in, %in_0 : f32 -# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 -# CHECK-NEXT: linalg.yield %1 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -125,96 +88,373 @@ # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0 to %c1 step %c1_0 { -# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> # CHECK-NEXT: %c0_4 = arith.constant 0 : index # CHECK-NEXT: %c8 = arith.constant 8 : index # CHECK-NEXT: %c1_5 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { -# CHECK-NEXT: %subview_6 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> # CHECK-NEXT: %c0_7 = arith.constant 0 : index # CHECK-NEXT: %c8_8 = arith.constant 8 : index # CHECK-NEXT: %c1_9 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_7 to %c8_8 step %c1_9 { -# CHECK-NEXT: %subview_10 = memref.subview %subview_6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_12 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_11 to %c16 step %c1_12 { -# CHECK-NEXT: %subview_13 = memref.subview %subview_10[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32> # CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32> # CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32> # CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> # CHECK-NEXT: } {"./b"} # CHECK-NEXT: %c0_1 = arith.constant 0 : index # CHECK-NEXT: %c1_2 = arith.constant 1 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 { -# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_5 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c8_12 = arith.constant 8 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_25 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_30 = arith.constant 0 : index +# CHECK-NEXT: %c3_31 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %c3_38 = arith.constant 3 : index +# CHECK-NEXT: %c1_39 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_45: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_45 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_7 = arith.constant 0 : index +# CHECK-NEXT: %c8_8 = arith.constant 8 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> # CHECK-NEXT: %c0_6 = arith.constant 0 : index # CHECK-NEXT: %c8 = arith.constant 8 : index # CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_6 to %c8 step %c1_7 { -# CHECK-NEXT: %subview_8 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c8_12 = arith.constant 8 : index # CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 { -# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c16 step %c1_18 { -# CHECK-NEXT: %subview_19 = memref.subview %subview_14[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_20 = memref.subview %subview_15[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_21 = memref.subview %subview_16[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index # CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_23 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg7 = %c0_22 to %c3 step %c1_23 { -# CHECK-NEXT: %subview_24 = memref.subview %subview_19[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_25 = memref.subview %subview_20[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_26 = memref.subview %subview_21[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_27 = arith.constant 0 : index -# CHECK-NEXT: %c3_28 = arith.constant 3 : index -# CHECK-NEXT: %c1_29 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg8 = %c0_27 to %c3_28 step %c1_29 { -# CHECK-NEXT: %subview_30 = memref.subview %subview_24[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_31 = memref.subview %subview_25[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_32 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_33 = arith.constant 0 : index -# CHECK-NEXT: %c3_34 = arith.constant 3 : index -# CHECK-NEXT: %c1_35 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg9 = %c0_33 to %c3_34 step %c1_35 { -# CHECK-NEXT: %subview_36 = memref.subview %subview_30[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_37 = memref.subview %subview_31[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_38 = memref.subview %subview_32[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_36, %subview_37 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_39: f32, %out: f32): -# CHECK-NEXT: %0 = arith.mulf %in, %in_39 : f32 -# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 -# CHECK-NEXT: linalg.yield %1 : f32 +# CHECK-NEXT: %c1_25 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_30 = arith.constant 0 : index +# CHECK-NEXT: %c3_31 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %c3_38 = arith.constant 3 : index +# CHECK-NEXT: %c1_39 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_45: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_45 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %subview) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c8_9 = arith.constant 8 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg7 = %c0_8 to %c8_9 step %c1_10 iter_args(%arg8 = %subview_7) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg9 = %c0_13 to %c16 step %c1_14 iter_args(%arg10 = %subview_12) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_16 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_17 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_15 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_15 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %subview_5) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_9 = memref.subview %subview[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c8_13 = arith.constant 8 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg7 = %c0_12 to %c8_13 step %c1_14 iter_args(%arg8 = %subview_11) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_17 = memref.subview %subview_10[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> +# CHECK-NEXT: %subview_18 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_20 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg9 = %c0_19 to %c16 step %c1_20 iter_args(%arg10 = %subview_18) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_17[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_26 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg11 = %c0_25 to %c3 step %c1_26 iter_args(%arg12 = %subview_24) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_23[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_31 = arith.constant 0 : index +# CHECK-NEXT: %c3_32 = arith.constant 3 : index +# CHECK-NEXT: %c1_33 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg13 = %c0_31 to %c3_32 step %c1_33 iter_args(%arg14 = %subview_30) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_35 = memref.subview %subview_28[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_36 = memref.subview %subview_29[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_37 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_38 = arith.constant 0 : index +# CHECK-NEXT: %c3_39 = arith.constant 3 : index +# CHECK-NEXT: %c1_40 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg15 = %c0_38 to %c3_39 step %c1_40 iter_args(%arg16 = %subview_37) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_42 = memref.subview %subview_35[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_43 = memref.subview %subview_36[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_44 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_42, %subview_43 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_46: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_46 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 # CHECK-NEXT: } +# CHECK-NEXT: %subview_45 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_44, %subview_45 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %subview_41 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_41 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %subview_34 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_34 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg12 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_27 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_27 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_21 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_21 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_15 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_15 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_8 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> +# CHECK-NEXT: memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } @@ -230,4 +470,3 @@ # CHECK-NEXT: - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32] # CHECK-NEXT: # CHECK-NEXT: CODE: 0 - diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py new file mode 100644 index 000000000..e9e9a91c1 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py @@ -0,0 +1,723 @@ +# RUN: python %s 2>&1 | filecheck %s + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend +from xtc.artifacts import get_operation +from xtc.artifacts import get_operation + +op = get_operation("conv2d", "ResNet18_01") +N, H, W, F, R, S, C = [op["dims"][k] for k in ["n", "h", "w", "f", "r", "s", "c"]] +SH, SW = [op["params"][k] for k in ["SH", "SW"]] +dtype = "float32" + +a = O.tensor((N, H + R - 1, W + S - 1, C), dtype) +b = O.tensor((R, S, C, F), dtype) + +with O.graph(name="conv2d_nhwc_r181") as gb: + O.conv2d(a, b, stride=(SH, SW), name="O") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sch.tile("w", {"w1": 4}) +sch.tile("f", {"f1": 16}) +sch.interchange(["b", "h", "w", "f", "r", "s", "c", "w1", "f1"]) +sch.vectorize(["f1"]) +sch.unroll({"w1": 4, "c": 3}) +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="conv2d_nhwc_r181_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32> +# CHECK-NEXT: %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%1 : tensor<1x112x112x64xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %3 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %4 = arith.addf %out, %3 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 +# CHECK-NEXT: } -> tensor<1x112x112x64xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./f" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./c" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_21 "./w1" : !transform.any_op +# CHECK-NEXT: transform.include @_vecto failures(suppress) (%tiled_linalg_op_20) : (!transform.any_op) -> () +# CHECK-NEXT: transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op +# CHECK-NEXT: transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op +# CHECK-NEXT: %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c6 = arith.constant 6 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32> +# CHECK-NEXT: %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32> +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32> +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %9 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c6 = arith.constant 6 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32> +# CHECK-NEXT: %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32> +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32> +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %9 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c6 = arith.constant 6 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %subview_3) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %3 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_2 = memref.subview %subview[0, %3, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %subview_3) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %5 = affine.apply #map(%arg7) +# CHECK-NEXT: %subview_5 = memref.subview %subview_2[0, 0, %5, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %subview_6) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %subview_9) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_11 = memref.subview %subview_5[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_8[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %8 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_17 = memref.subview %subview_15[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_18 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_17, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_18 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_19 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_18, %subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_20 = memref.subview %subview_15[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_20, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_21 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_22 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_15[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_23, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_24 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_25 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_24, %subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_26 = memref.subview %subview_15[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_27 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_26, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_27 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_28 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_27, %subview_28 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_13[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %subview_14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_31 = memref.subview %subview_29[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_32 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_31, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_32 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_33 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_32, %subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_34 = memref.subview %subview_29[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_35 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_34, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_35 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_36 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_35, %subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_37 = memref.subview %subview_29[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_38 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_39 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_38, %subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_40 = memref.subview %subview_29[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_41 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_41 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_42 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_41, %subview_42 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_43 = memref.subview %subview_13[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_44 = memref.subview %subview_14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_45 = memref.subview %subview_43[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_46 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_45, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_46 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_47 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_46, %subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_48 = memref.subview %subview_43[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_49 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_48, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_49 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_50 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_49, %subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_51 = memref.subview %subview_43[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_52 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_51, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_52 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_53 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_52, %subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_54 = memref.subview %subview_43[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_55 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_54, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_55 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): +# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 +# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 +# CHECK-NEXT: linalg.yield %10 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_56 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_55, %subview_56 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg14 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_10 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_10 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_7 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: conv2d_nhwc_r181 +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x230x230x3xfloat32 +# CHECK-NEXT: - %1 : 7x7x3x64xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 1x112x112x64xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: conv2d(%0, %1, stride=(2, 2)) {name = 'O'} : [1x230x230x3xfloat32, 7x7x3x64xfloat32] -> [1x112x112x64xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py index 5a7ed668e..b240b6bbd 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py @@ -30,8 +30,9 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // -# CHECK-NEXT: module { + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 @@ -40,28 +41,6 @@ # CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -85,47 +64,178 @@ # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_3 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_3 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { -# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %c0_3 = arith.constant 0 : index # CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_3 to %c32 step %c1_4 { -# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %2 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_5 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %c0_0 = arith.constant 0 : index # CHECK-NEXT: %c4_1 = arith.constant 4 : index # CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 { +# CHECK-NEXT: %1 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %0) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> # CHECK-NEXT: %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_4 = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %c0_5 = arith.constant 0 : index # CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1_6 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_5 to %c32 step %c1_6 { -# CHECK-NEXT: %subview_7 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %subview_4[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %2 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %subview_4) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_3[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c512 = arith.constant 512 : index -# CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_10 to %c512 step %c1_11 { -# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_13 = memref.subview %subview_8[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_14 = memref.subview %subview_9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg7 = %c0_11 to %c512 step %c1_12 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_14, %subview_15 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_16 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_13 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_7 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } @@ -141,4 +251,3 @@ # CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] # CHECK-NEXT: # CHECK-NEXT: CODE: 0 - diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py index 037bc5f53..04095b436 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py @@ -18,6 +18,11 @@ impl = Backend(graph, use_tensor_dialect=True) sch = impl.get_scheduler(default_node="matmul") +sch.tile("i", {"i1": 2}) +sch.tile("j", {"j1": 16}) +sch.interchange(["k", "i", "j", "i1", "j1"]) +sch.vectorize(["j1"]) +sch.unroll({"i1": 2}) sched = sch.schedule() comp = impl.get_compiler( @@ -31,17 +36,18 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // + +# CHECK: // -----// IR Dump Before transform //----- // # CHECK-NEXT: #map = affine_map<(d0) -> (d0)> # CHECK-NEXT: #map1 = affine_map<(d0) -> ()> -# CHECK-NEXT: module { +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> # CHECK-NEXT: %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> # CHECK-NEXT: %3 = tensor.empty() : tensor<4x32xf32> -# CHECK-NEXT: %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> +# CHECK-NEXT: %collapsed = tensor.collapse_shape %2 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> # CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs = {__xtc_id_relu_} { @@ -53,52 +59,6 @@ # CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): -# CHECK-NEXT: %0 = arith.maximumf %in, %in_2 : f32 -# CHECK-NEXT: linalg.yield %0 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> -# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): -# CHECK-NEXT: %0 = arith.maximumf %in, %in_2 : f32 -# CHECK-NEXT: linalg.yield %0 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> -# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -110,15 +70,28 @@ # CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op # CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op -# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_9 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./k" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> () +# CHECK-NEXT: transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op +# CHECK-NEXT: %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./i" : !transform.any_op # CHECK-NEXT: transform.yield # CHECK-NEXT: } # CHECK-NEXT: } @@ -127,63 +100,280 @@ # CHECK-NEXT: #map = affine_map<(d0) -> (d0)> # CHECK-NEXT: #map1 = affine_map<(d0) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_4 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> +# CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> +# CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> +# CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> +# CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<2x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c128 = arith.constant 128 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_4: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_4 : f32 +# CHECK-NEXT: linalg.yield %7 : f32 +# CHECK-NEXT: } -> tensor<1xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<128xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { -# CHECK-NEXT: %subview = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_7 = arith.constant 0 : index -# CHECK-NEXT: %c32 = arith.constant 32 : index -# CHECK-NEXT: %c1_8 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_7 to %c32 step %c1_8 { -# CHECK-NEXT: %subview_9 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_4 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_0 = arith.constant 0 : index -# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> +# CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> +# CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> +# CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> +# CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<2x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c128 = arith.constant 128 : index # CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 { -# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_7 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_8 = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c32 = arith.constant 32 : index -# CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c32 step %c1_10 { -# CHECK-NEXT: %subview_11 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_14 = arith.constant 0 : index -# CHECK-NEXT: %c512 = arith.constant 512 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_14 to %c512 step %c1_15 { -# CHECK-NEXT: %subview_16 = memref.subview %subview_11[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_17 = memref.subview %subview_12[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_18 = memref.subview %subview_13[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_} ins(%subview_16, %subview_17 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_18 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_4: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_4 : f32 +# CHECK-NEXT: linalg.yield %7 : f32 +# CHECK-NEXT: } -> tensor<1xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<128xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> -# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_6) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %subview_4[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[%c0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %6 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %7 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %subview_11[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %11 = vector.broadcast %10 : f32 to vector<16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %13 = vector.fma %11, %9, %12 : vector<16xf32> +# CHECK-NEXT: %14 = vector.insert %13, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %14, %subview_11[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_11, %subview_12 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_5[%c1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %15 = vector.transfer_read %subview_13[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %16 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_read %subview_14[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %18 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %19 = vector.extract %15[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %20 = vector.broadcast %19 : f32 to vector<16xf32> +# CHECK-NEXT: %21 = vector.extract %17[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %22 = vector.fma %20, %18, %21 : vector<16xf32> +# CHECK-NEXT: %23 = vector.insert %22, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %23, %subview_14[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_14, %subview_15 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_9, %subview_16 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_7 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_7 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %4 : memref<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %collapse_shape = memref.collapse_shape %2 [[0, 1]] : memref<4x32xf32> into memref<128xf32> +# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index # CHECK-NEXT: %c128 = arith.constant 128 : index -# CHECK-NEXT: %c1_6 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_5 to %c128 step %c1_6 { +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c128 step %c1_3 iter_args(%arg4 = %alloca_1) -> (memref<128xf32>) { # CHECK-NEXT: %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: %subview_7 = memref.subview %alloca_3[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_4 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_7 : memref<1xf32, strided<[1], offset: ?>>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): -# CHECK-NEXT: %0 = arith.maximumf %in, %in_8 : f32 -# CHECK-NEXT: linalg.yield %0 : f32 +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_0 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_4 : memref<1xf32, strided<[1], offset: ?>>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_6: f32, %out: f32): +# CHECK-NEXT: %4 = arith.maximumf %in, %in_6 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 # CHECK-NEXT: } +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_4, %subview_5 : memref<1xf32, strided<[1], offset: ?>> to memref<1xf32, strided<[1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<128xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %expand_shape = memref.expand_shape %alloca_3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> +# CHECK-NEXT: %expand_shape = memref.expand_shape %3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> # CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py index 68c2c7617..255c463df 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py @@ -32,11 +32,12 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // + +# CHECK: // -----// IR Dump Before transform //----- // # CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module { +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 @@ -45,7 +46,7 @@ # CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> -# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%0, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%inserted_slice, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { # CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): # CHECK-NEXT: %5 = arith.mulf %in, %in_1 : f32 # CHECK-NEXT: %6 = arith.addf %out, %5 : f32 @@ -54,54 +55,6 @@ # CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>) -# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>) -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): -# CHECK-NEXT: %0 = arith.mulf %in, %in_1 : f32 -# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 -# CHECK-NEXT: linalg.yield %1 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>) -# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>) -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): -# CHECK-NEXT: %0 = arith.mulf %in, %in_1 : f32 -# CHECK-NEXT: %1 = arith.addf %out, %0 : f32 -# CHECK-NEXT: linalg.yield %1 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -116,7 +69,7 @@ # CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_5 "./c" : !transform.any_op -# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -124,31 +77,22 @@ # CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_13 "./c" : !transform.any_op -# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_15 "./b" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_17 "./h" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_19 "./w" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_21 "./f" : !transform.any_op -# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_23 "./b" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_25 "./h" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_27 "./w" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_29 "./f" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_31 "./r" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_33 "./s" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_35 "./c" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_23 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_25 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_27 "./c" : !transform.any_op # CHECK-NEXT: transform.yield # CHECK-NEXT: } # CHECK-NEXT: } @@ -159,127 +103,494 @@ # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c12_13 = arith.constant 12 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c4_13 = arith.constant 4 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c4_17 = arith.constant 4 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %8 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_23 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_24 = arith.constant 1 : index +# CHECK-NEXT: %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_29 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c1_30 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %c5_36 = arith.constant 5 : index +# CHECK-NEXT: %c1_37 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_42 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_43 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_49: f32, %out: f32): +# CHECK-NEXT: %14 = arith.mulf %in, %in_49 : f32 +# CHECK-NEXT: %15 = arith.addf %out, %14 : f32 +# CHECK-NEXT: linalg.yield %15 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c12_13 = arith.constant 12 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c4_13 = arith.constant 4 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c4_17 = arith.constant 4 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %8 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_23 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_24 = arith.constant 1 : index +# CHECK-NEXT: %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_29 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c1_30 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %c5_36 = arith.constant 5 : index +# CHECK-NEXT: %c1_37 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_42 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_43 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_49: f32, %out: f32): +# CHECK-NEXT: %14 = arith.mulf %in, %in_49 : f32 +# CHECK-NEXT: %15 = arith.addf %out, %14 : f32 +# CHECK-NEXT: linalg.yield %15 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0 to %c1 step %c1_0 { -# CHECK-NEXT: %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) { +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: %c0_9 = arith.constant 0 : index # CHECK-NEXT: %c12 = arith.constant 12 : index # CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c12 step %c1_10 { -# CHECK-NEXT: %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c12_13 = arith.constant 12 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_12 to %c12_13 step %c1_14 { -# CHECK-NEXT: %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_9 to %c12 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c12_14 = arith.constant 12 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_13 to %c12_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { +# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index # CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_17 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_16 to %c3 step %c1_17 { -# CHECK-NEXT: %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c3 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { +# CHECK-NEXT: %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) +# CHECK-NEXT: %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_16 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x12x12x3xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> +# CHECK-NEXT: %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> # CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_2 = arith.constant 0 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index # CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 { -# CHECK-NEXT: %subview_8 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: %c0_9 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_9 to %c4 step %c1_10 { -# CHECK-NEXT: %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c4_13 = arith.constant 4 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_12 to %c4_13 step %c1_14 { -# CHECK-NEXT: %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_9 to %c4 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c4_14 = arith.constant 4 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_13 to %c4_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_17 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_16 to %c16 step %c1_17 { -# CHECK-NEXT: %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_16 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} # CHECK-NEXT: %c0_5 = arith.constant 0 : index # CHECK-NEXT: %c1_6 = arith.constant 1 : index # CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 { -# CHECK-NEXT: %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %subview_8 = memref.subview %0[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_12 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_11 to %c4 step %c1_12 { -# CHECK-NEXT: %0 = affine.apply #map(%arg4) -# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_14 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_10[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c4_17 = arith.constant 4 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_16 to %c4_17 step %c1_18 { -# CHECK-NEXT: %1 = affine.apply #map(%arg5) -# CHECK-NEXT: %subview_19 = memref.subview %subview_13[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_20 = memref.subview %subview_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_11 to %c4 step %c1_12 iter_args(%arg6 = %subview_10) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, %4, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> +# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c4_18 = arith.constant 4 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg7 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg8 = %subview_16) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg7) +# CHECK-NEXT: %subview_21 = memref.subview %subview_14[0, 0, %6, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> +# CHECK-NEXT: %subview_23 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_23 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_22 to %c16 step %c1_23 { -# CHECK-NEXT: %subview_24 = memref.subview %subview_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_25 = memref.subview %subview_20[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_26 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_27 = arith.constant 0 : index +# CHECK-NEXT: %c1_25 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg9 = %c0_24 to %c16 step %c1_25 iter_args(%arg10 = %subview_23) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_30 = arith.constant 0 : index # CHECK-NEXT: %c5 = arith.constant 5 : index -# CHECK-NEXT: %c1_28 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg7 = %c0_27 to %c5 step %c1_28 { -# CHECK-NEXT: %subview_29 = memref.subview %subview_24[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_30 = memref.subview %subview_25[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_31 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_32 = arith.constant 0 : index -# CHECK-NEXT: %c5_33 = arith.constant 5 : index -# CHECK-NEXT: %c1_34 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg8 = %c0_32 to %c5_33 step %c1_34 { -# CHECK-NEXT: %subview_35 = memref.subview %subview_29[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_36 = memref.subview %subview_30[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_37 = memref.subview %subview_31[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_38 = arith.constant 0 : index +# CHECK-NEXT: %c1_31 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg11 = %c0_30 to %c5 step %c1_31 iter_args(%arg12 = %subview_29) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_33 = memref.subview %subview_27[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_34 = memref.subview %subview_28[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_35 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_36 = arith.constant 0 : index +# CHECK-NEXT: %c5_37 = arith.constant 5 : index +# CHECK-NEXT: %c1_38 = arith.constant 1 : index +# CHECK-NEXT: %9 = scf.for %arg13 = %c0_36 to %c5_37 step %c1_38 iter_args(%arg14 = %subview_35) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_40 = memref.subview %subview_33[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_41 = memref.subview %subview_34[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_42 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %c0_43 = arith.constant 0 : index # CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_39 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg9 = %c0_38 to %c3 step %c1_39 { -# CHECK-NEXT: %subview_40 = memref.subview %subview_35[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_41 = memref.subview %subview_36[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_42 = memref.subview %subview_37[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_41 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_42 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_43: f32, %out: f32): -# CHECK-NEXT: %2 = arith.mulf %in, %in_43 : f32 -# CHECK-NEXT: %3 = arith.addf %out, %2 : f32 -# CHECK-NEXT: linalg.yield %3 : f32 +# CHECK-NEXT: %c1_44 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg15 = %c0_43 to %c3 step %c1_44 iter_args(%arg16 = %subview_42) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_46 = memref.subview %subview_40[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_47 = memref.subview %subview_41[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_48 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_47 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_48 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_50: f32, %out: f32): +# CHECK-NEXT: %11 = arith.mulf %in, %in_50 : f32 +# CHECK-NEXT: %12 = arith.addf %out, %11 : f32 +# CHECK-NEXT: linalg.yield %12 : f32 # CHECK-NEXT: } +# CHECK-NEXT: %subview_49 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_48, %subview_49 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg16 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %subview_45 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %10, %subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %subview_39 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %9, %subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg12 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_32 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %8, %subview_32 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_26 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_26 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_13 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_13 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> +# CHECK-NEXT: memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py index 34c09d96e..b05c8a8d7 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py @@ -31,8 +31,9 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // -# CHECK-NEXT: module { + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 @@ -45,58 +46,12 @@ # CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32> -# CHECK-NEXT: %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%0, %2 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%inserted_slice, %inserted_slice_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32> # CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> # CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>) -# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>) -# CHECK-NEXT: %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>) -# CHECK-NEXT: %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>) -# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>) -# CHECK-NEXT: %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>) -# CHECK-NEXT: %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -107,120 +62,332 @@ # CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op -# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op -# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_7 "./i" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_9 "./j" : !transform.any_op -# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_11 "./i" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_13 "./j" : !transform.any_op -# CHECK-NEXT: %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_15 "./i" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_17 "./j" : !transform.any_op -# CHECK-NEXT: %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_19 "./i" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_21 "./j" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_23 "./k" : !transform.any_op -# CHECK-NEXT: %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_25 "./i" : !transform.any_op -# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -# CHECK-NEXT: transform.annotate %loops_27 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./k" : !transform.any_op # CHECK-NEXT: transform.yield # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c16_2 = arith.constant 16 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_5 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c16_7 = arith.constant 16 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c16_10 = arith.constant 16 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c16_16 = arith.constant 16 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c16_23 = arith.constant 16 : index +# CHECK-NEXT: %c1_24 = arith.constant 1 : index +# CHECK-NEXT: %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_29 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c16_2 = arith.constant 16 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_5 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c16_7 = arith.constant 16 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %c16_14 = arith.constant 16 : index +# CHECK-NEXT: %c1_15 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c16_10 = arith.constant 16 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c16_16 = arith.constant 16 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c16_23 = arith.constant 16 : index +# CHECK-NEXT: %c1_24 = arith.constant 1 : index +# CHECK-NEXT: %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_29 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { -# CHECK-NEXT: %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: %c0_16 = arith.constant 0 : index # CHECK-NEXT: %c16_17 = arith.constant 16 : index # CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { -# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_2 = arith.constant 0 : index # CHECK-NEXT: %c16_3 = arith.constant 16 : index # CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 { -# CHECK-NEXT: %subview_15 = memref.subview %alloca_0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: %c0_16 = arith.constant 0 : index # CHECK-NEXT: %c16_17 = arith.constant 16 : index # CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { -# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_5 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_5 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> # CHECK-NEXT: %cst_7 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_8 = arith.constant 0 : index # CHECK-NEXT: %c16_9 = arith.constant 16 : index # CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 { -# CHECK-NEXT: %subview_15 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %alloca_6) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: %c0_16 = arith.constant 0 : index # CHECK-NEXT: %c16_17 = arith.constant 16 : index # CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 { -# CHECK-NEXT: %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c16_12 = arith.constant 16 : index # CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 { -# CHECK-NEXT: %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>> -# CHECK-NEXT: %subview_17 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 iter_args(%arg4 = %2) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_15 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %1[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_17 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: %c0_18 = arith.constant 0 : index # CHECK-NEXT: %c16_19 = arith.constant 16 : index # CHECK-NEXT: %c1_20 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_18 to %c16_19 step %c1_20 { -# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %subview_17[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_24 = arith.constant 0 : index -# CHECK-NEXT: %c16_25 = arith.constant 16 : index -# CHECK-NEXT: %c1_26 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_24 to %c16_25 step %c1_26 { -# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_28 = memref.subview %subview_22[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_29 = memref.subview %subview_23[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %4 = scf.for %arg5 = %c0_18 to %c16_19 step %c1_20 iter_args(%arg6 = %subview_17) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_16[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %c16_26 = arith.constant 16 : index +# CHECK-NEXT: %c1_27 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg7 = %c0_25 to %c16_26 step %c1_27 iter_args(%arg8 = %subview_24) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_29 = memref.subview %subview_22[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %subview_23[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_31 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_31 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_32 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_31, %subview_32 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_28 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_21 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_21 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_14 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_14 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py index c748923b8..7a9eb3442 100644 --- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -32,8 +32,9 @@ executor = module.get_executor(validate=True) res = executor.execute() print(f"CODE: {res}") -# CHECK: // -----// IR Dump Before Tensor Lowering //----- // -# CHECK-NEXT: module { + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 @@ -42,40 +43,10 @@ # CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32> -# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %0 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %2 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> # CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: module { -# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK-NEXT: } -# CHECK-NEXT: -# CHECK-NEXT: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>) -# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>) -# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> -# CHECK-NEXT: return -# CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { # CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op # CHECK-NEXT: transform.yield @@ -111,87 +82,333 @@ # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { -# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0 to %c4 step %c1 { -# CHECK-NEXT: %subview = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %c0_9 = arith.constant 0 : index # CHECK-NEXT: %c32_10 = arith.constant 32 : index # CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 { -# CHECK-NEXT: %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %c0_0 = arith.constant 0 : index # CHECK-NEXT: %c4_1 = arith.constant 4 : index # CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 { -# CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c32_12 = arith.constant 32 : index # CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 { -# CHECK-NEXT: %subview_14 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> # CHECK-NEXT: %c0_17 = arith.constant 0 : index # CHECK-NEXT: %c512 = arith.constant 512 : index # CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c512 step %c1_18 { -# CHECK-NEXT: %subview_19 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_20 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_21 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_19, %subview_20 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x1xf32> # CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> # CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_4 = arith.constant 0 : index # CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1_5 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c32 step %c1_5 { -# CHECK-NEXT: %subview = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %c0_9 = arith.constant 0 : index # CHECK-NEXT: %c32_10 = arith.constant 32 : index # CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 { -# CHECK-NEXT: %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %c0_6 = arith.constant 0 : index # CHECK-NEXT: %c32_7 = arith.constant 32 : index # CHECK-NEXT: %c1_8 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 { -# CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %alloca[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %c0_11 = arith.constant 0 : index # CHECK-NEXT: %c32_12 = arith.constant 32 : index # CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 { -# CHECK-NEXT: %subview_14 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, %arg5] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> # CHECK-NEXT: %c0_17 = arith.constant 0 : index # CHECK-NEXT: %c4_18 = arith.constant 4 : index # CHECK-NEXT: %c1_19 = arith.constant 1 : index -# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c4_18 step %c1_19 { -# CHECK-NEXT: %subview_20 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_21 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_20, %subview_21 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_22 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> +# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c32_7 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c4_18 = arith.constant 4 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %0) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_15 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg8 = %c0_18 to %c512 step %c1_19 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_21, %subview_22 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_23 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_23, %subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_20 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c32_7 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %2) -> (memref<32x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %1[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_15 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c4_19 = arith.constant 4 : index +# CHECK-NEXT: %c1_20 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg8 = %c0_18 to %c4_19 step %c1_20 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_22, %subview_23 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_25 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_24, %subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_21 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32> +# CHECK-NEXT: memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } From e9b27d0c0e198ae7fad05bf492a1579b51fe7692 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Fri, 13 Feb 2026 15:04:07 +0100 Subject: [PATCH 11/14] tensor-dialect: multi-output graphs and nodes support --- src/xtc/backends/mlir/MlirGraphBackend.py | 30 +++++++++++------------ src/xtc/backends/mlir/MlirOps.py | 10 ++++---- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 0c05ebd16..ab973fb68 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -94,13 +94,12 @@ def _xdsl_generate_node( variables[name] = alloca.results[0] args = [variables[name] for name in names] _, attrs = operation.generate(block=block, args=args) - last_node = attrs["nodes_map"].get("return_node_id") # the tensor dialect needs the result of the op, not the alloca if self.xdsl_type == TensorType: - # for name in node.outputs: - assert len(node.outputs) == 1 - variables[node.outputs[0]] = last_node.results[0] - return attrs, last_node + assert len(node.outputs) == len(attrs["output_nodes"]) + for name, output in zip(node.outputs, attrs["output_nodes"]): + variables[name] = output.results[0] + return attrs def _init_from_graph( self, @@ -129,22 +128,21 @@ def _init_from_graph( for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args) } block_attrs = [] - last_node = None for node in graph.nodes.values(): - node_attrs, last_node = self._xdsl_generate_node( - node, inlined_block, variables - ) + node_attrs = self._xdsl_generate_node(node, inlined_block, variables) block_attrs.append(node_attrs) with ImplicitBuilder(inlined_block): if self.xdsl_type == TensorType: - assert last_node - # write the final tensor value to the output buffer - dest = bufferization.MaterializeInDestinationOp( - operands=((last_node.results[0],), (inlined_block.args[-1],)), - result_types=((),), - attributes={"writable": UnitAttr(), "restrict": UnitAttr()}, - ) + # write the final tensor values to the output buffers + for name, out_arg in zip( + graph.outputs, inlined_block.args[-len(graph.outputs) :] + ): + bufferization.MaterializeInDestinationOp( + operands=((variables[name],), (out_arg,)), + result_types=((),), + attributes={"writable": UnitAttr(), "restrict": UnitAttr()}, + ) func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy payload = xdslFuncOp.from_region( diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index df8a4aba0..c7b0eb411 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -192,12 +192,12 @@ def generate_op( "nodes_map": { fill_node_id: fill, reduce_node_id: reduce, - "return_node_id": reduce, }, "dims_sizes": [ {"i": Ki, "j": Kj}, self.dims_sizes(), ], + "output_nodes": [reduce], } return block, attrs @@ -341,12 +341,12 @@ def generate_op( "nodes_map": { fill_node_id: fill, reduce_node_id: reduce, - "return_node_id": reduce, }, "dims_sizes": [ {"b": Kb, "h": Kh, "w": Kw, "f": Kf}, self.dims_sizes(), ], + "output_nodes": [reduce], } return block, attrs @@ -497,11 +497,11 @@ def generate_op( attrs = { "nodes_map": { relu_node_id: relu, - "return_node_id": relu_result, }, "dims_sizes": [ self.dims_sizes(), ], + "output_nodes": [relu_result], } return block, attrs @@ -613,12 +613,12 @@ def generate_op( "nodes_map": { fill_node_id: fill, copy_node_id: None if using_tensors else copy, - "return_node_id": copy, }, "dims_sizes": [ self.dims_sizes(), *([] if using_tensors else [self.dims_sizes()]), ], + "output_nodes": [copy], } return block, attrs @@ -728,9 +728,9 @@ def generate_op( attrs = { "nodes_map": { copy_node_id: None if using_tensors else copy, - "return_node_id": copy, }, "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])], + "output_nodes": [copy], } return block, attrs From b06b0f2abbf92145f4f6a13984291d1e84523402 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Tue, 24 Feb 2026 11:32:14 +0100 Subject: [PATCH 12/14] tensor-dialect: changed to non-collapse relu, added fusion cleanup passes --- src/xtc/backends/mlir/MlirCompilerPasses.py | 2 + src/xtc/backends/mlir/MlirOps.py | 81 ++-- .../test_conv2d_mini_mlir_tensor.py | 180 +++------ .../test_conv2d_r181_mlir_tensor.py | 369 +++++++++--------- .../tensor_dialect/test_matmul_mlir_tensor.py | 65 ++- .../test_matmul_relu_mlir_tensor.py | 294 +++++++------- .../test_pad_conv2d_mlir_tensor.py | 244 +++++------- .../test_pad_matmul_unpad_mlir_tensor.py | 133 +++---- .../test_two_matmuls_mlir_tensor.py | 130 +++--- 9 files changed, 622 insertions(+), 876 deletions(-) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index a52e6aa21..804adce6b 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -563,6 +563,8 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram): bufferize_options.append("buffer-alignment=256") apply_passes.run( [ + "canonicalize", + "cse", "eliminate-empty-tensors", # causes ops to write directly to out buffer f"one-shot-bufferize{{{' '.join(bufferize_options)}}}", "func.func(buffer-hoisting)", diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index c7b0eb411..836f7d2e7 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -421,77 +421,58 @@ def generate_op( ] ) if self.op_type == TensorType: - inp = tensor.CollapseShapeOp( # type: ignore - operands=[args[0]], - properties=dict(reassociation=inp_reassociation), - result_types=[self.op_type(elt_type, (inp_size,))], - ) - # create empty tensor for collapsed output shape - out_empty = tensor.EmptyOp([], TensorType(elt_type, [out_size])) - out_operand = out_empty.tensor + out_operand = args[1] + inp_operand = args[0] + rank = len(out_shape) + iterator_types = [StringAttr("parallel")] * rank + indexing_maps = [ + AffineMapAttr(AffineMap.identity(rank)), # input + AffineMapAttr( + AffineMap.identity(rank).drop_results(out_shape) + ), # scalar + AffineMapAttr(AffineMap.identity(rank)), # output + ] else: inp = memref.CollapseShapeOp( # type: ignore operands=[args[0]], properties=dict(reassociation=inp_reassociation), result_types=[self.op_type(elt_type, (inp_size,))], ) + inp_operand = inp.results[0] # type: ignore out = memref.CollapseShapeOp( operands=[args[1]], properties=dict(reassociation=out_reassociation), result_types=[self.op_type(elt_type, (out_size,))], ) - out_operand = out.results[0] - - result = ( - (TensorType(elt_type, [out_size]),) - if self.op_type == TensorType - else () - ) + out_operand = out.results[0] # type: ignore + iterator_types = [ + StringAttr({"P": "parallel", "R": "reduction"}[k]) + for k in self.KINDS + ] + # ignore typing due to xdsl hints limitation + indexing_maps = [ + AffineMapAttr(AffineMap.from_callable(lambda i: (i,))), # type: ignore + AffineMapAttr(AffineMap.from_callable(lambda _: ())), # type: ignore + AffineMapAttr(AffineMap.from_callable(lambda i: (i,))), # type: ignore + ] + iterator_types = [ + StringAttr({"P": "parallel", "R": "reduction"}[k]) + for k in self.KINDS + ] + result = (args[1].type,) if self.op_type == TensorType else () cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) - iterator_types = [ - StringAttr({"P": "parallel", "R": "reduction"}[k]) for k in self.KINDS - ] block_in = Block(arg_types=[f32, f32, f32]) with ImplicitBuilder(block_in): max = arith.MaximumfOp(block_in.args[0], block_in.args[1]) linalg.YieldOp(max) relu = linalg.GenericOp( - inputs=(inp.results[0], cst0.results[0]), + inputs=(inp_operand, cst0.results[0]), outputs=(out_operand,), body=Region([block_in]), # type: ignore # mypy issue with dataclass - # ignore typing due to xdsl hints limitation - indexing_maps=[ - AffineMapAttr( - AffineMap.from_callable( - lambda i: # type: ignore - (i,) - ) - ), - AffineMapAttr( - AffineMap.from_callable( - lambda _: # type: ignore - () - ) - ), - AffineMapAttr( - AffineMap.from_callable( - lambda i: # type: ignore - (i,) - ) - ), - ], + indexing_maps=indexing_maps, iterator_types=iterator_types, result_types=result, ) - relu_result = None - if self.op_type == TensorType: - relu_result = tensor.ExpandShapeOp( - relu.results[0], - reassociation=out_reassociation, - result_type=TensorType(elt_type, out_shape), - static_output_shape=out_shape, - dynamic_output_shape=[], - ) relu_node_id = f"{self.name}" relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr() attrs = { @@ -501,7 +482,7 @@ def generate_op( "dims_sizes": [ self.dims_sizes(), ], - "output_nodes": [relu_result], + "output_nodes": [relu], } return block, attrs diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py index bd8db60bd..a6791aaad 100644 --- a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py @@ -333,127 +333,73 @@ # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) { -# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_4 = arith.constant 0 : index -# CHECK-NEXT: %c8 = arith.constant 8 : index -# CHECK-NEXT: %c1_5 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %subview) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_8 = arith.constant 0 : index -# CHECK-NEXT: %c8_9 = arith.constant 8 : index -# CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg7 = %c0_8 to %c8_9 step %c1_10 iter_args(%arg8 = %subview_7) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_12 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg9 = %c0_13 to %c16 step %c1_14 iter_args(%arg10 = %subview_12) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_16 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) -# CHECK-NEXT: %subview_17 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_15 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_15 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_3 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_3, %subview_4 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> -# CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %c0_1 = arith.constant 0 : index -# CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) { -# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_6 = arith.constant 0 : index -# CHECK-NEXT: %c8 = arith.constant 8 : index -# CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %subview_5) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_9 = memref.subview %subview[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c8_13 = arith.constant 8 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg7 = %c0_12 to %c8_13 step %c1_14 iter_args(%arg8 = %subview_11) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_17 = memref.subview %subview_10[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> -# CHECK-NEXT: %subview_18 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_19 = arith.constant 0 : index -# CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_20 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg9 = %c0_19 to %c16 step %c1_20 iter_args(%arg10 = %subview_18) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_22 = memref.subview %subview_16[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %subview_17[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_24 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_25 = arith.constant 0 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_26 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg11 = %c0_25 to %c3 step %c1_26 iter_args(%arg12 = %subview_24) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_29 = memref.subview %subview_23[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_31 = arith.constant 0 : index -# CHECK-NEXT: %c3_32 = arith.constant 3 : index -# CHECK-NEXT: %c1_33 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg13 = %c0_31 to %c3_32 step %c1_33 iter_args(%arg14 = %subview_30) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_35 = memref.subview %subview_28[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_36 = memref.subview %subview_29[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_37 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_38 = arith.constant 0 : index -# CHECK-NEXT: %c3_39 = arith.constant 3 : index -# CHECK-NEXT: %c1_40 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg15 = %c0_38 to %c3_39 step %c1_40 iter_args(%arg16 = %subview_37) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_42 = memref.subview %subview_35[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_43 = memref.subview %subview_36[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_44 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_42, %subview_43 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_46: f32, %out: f32): -# CHECK-NEXT: %8 = arith.mulf %in, %in_46 : f32 -# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 -# CHECK-NEXT: linalg.yield %9 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_45 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_44, %subview_45 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %subview_41 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %7, %subview_41 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %subview[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg1[0, 0, 0, %arg7] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %subview_6) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %subview_2[0, %arg9, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_5[%arg9, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_10 = memref.subview %subview_8[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_9[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_14: f32, %out: f32): +# CHECK-NEXT: %7 = arith.mulf %in, %in_14 : f32 +# CHECK-NEXT: %8 = arith.addf %out, %7 : f32 +# CHECK-NEXT: linalg.yield %8 : f32 +# CHECK-NEXT: } # CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./s"} -# CHECK-NEXT: %subview_34 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %6, %subview_34 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg12 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./r"} -# CHECK-NEXT: %subview_27 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_27 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_21 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_21 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_15 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_15 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_8 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %6 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %5 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_7 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_4 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> -# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: } {"./h"} # CHECK-NEXT: memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py index e9e9a91c1..a363b4e14 100644 --- a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py @@ -503,9 +503,6 @@ # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { -# CHECK-NEXT: %c6 = arith.constant 6 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c2 = arith.constant 2 : index # CHECK-NEXT: %c7 = arith.constant 7 : index # CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c4 = arith.constant 4 : index @@ -514,197 +511,185 @@ # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) { -# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_3 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %subview_3) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_5 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>) -# CHECK-NEXT: %subview_6 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c64 step %c1 iter_args(%arg8 = %subview_2) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>) +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_4, %subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_3 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> -# CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) { -# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %3 = affine.apply #map(%arg5) -# CHECK-NEXT: %subview_2 = memref.subview %subview[0, %3, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %subview_3) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %5 = affine.apply #map(%arg7) -# CHECK-NEXT: %subview_5 = memref.subview %subview_2[0, 0, %5, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %6 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %subview_6) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_8 = memref.subview %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %subview_9) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_11 = memref.subview %subview_5[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_12 = memref.subview %subview_8[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %8 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { -# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_17 = memref.subview %subview_15[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_18 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_17, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_18 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_19 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_18, %subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_20 = memref.subview %subview_15[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_21 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_20, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_21 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_22 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %subview_15[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_24 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_23, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_24 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_25 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_24, %subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_26 = memref.subview %subview_15[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_27 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_26, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_27 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_28 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_27, %subview_28 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_29 = memref.subview %subview_13[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_30 = memref.subview %subview_14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_31 = memref.subview %subview_29[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_32 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_31, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_32 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_33 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_32, %subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_34 = memref.subview %subview_29[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_35 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_34, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_35 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_36 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_35, %subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_37 = memref.subview %subview_29[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_38 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_39 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_38, %subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_40 = memref.subview %subview_29[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_41 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_41 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_42 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_41, %subview_42 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_43 = memref.subview %subview_13[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_44 = memref.subview %subview_14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_45 = memref.subview %subview_43[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_46 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_45, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_46 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_47 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_46, %subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_48 = memref.subview %subview_43[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_49 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_48, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_49 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_50 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_49, %subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_51 = memref.subview %subview_43[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_52 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_51, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_52 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_53 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_52, %subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: %subview_54 = memref.subview %subview_43[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_55 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_54, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_55 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_57: f32, %out: f32): -# CHECK-NEXT: %9 = arith.mulf %in, %in_57 : f32 -# CHECK-NEXT: %10 = arith.addf %out, %9 : f32 -# CHECK-NEXT: linalg.yield %10 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_56 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_55, %subview_56 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg14 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./s"} -# CHECK-NEXT: scf.yield %8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./r"} -# CHECK-NEXT: %subview_10 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %7, %subview_10 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %6, %subview_7 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview = memref.subview %arg0[0, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %2 = affine.apply #map(%arg3) +# CHECK-NEXT: %subview_0 = memref.subview %subview[0, %2, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c4 iter_args(%arg6 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_3 = memref.subview %subview_0[0, 0, %4, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c64 step %c16 iter_args(%arg8 = %subview_4) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg1[0, 0, 0, %arg7] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %subview_7) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_9 = memref.subview %subview_3[0, %arg9, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_6[%arg9, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_11 = memref.subview %subview_9[0, 0, %arg11, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, %arg11, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, 0, 0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_15, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_16 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_17 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_18 = memref.subview %subview_13[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_18, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_20 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_19, %subview_20 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_13[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_21, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_23 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_22, %subview_23 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %subview_13[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_25 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_24, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_26 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_25, %subview_26 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_27 = memref.subview %subview_11[0, 0, 0, 1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_27[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_29, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_30 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_31 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_30, %subview_31 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_32 = memref.subview %subview_27[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_33 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_32, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_34 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_33, %subview_34 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_35 = memref.subview %subview_27[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_36 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_35, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_37 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_36, %subview_37 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_38 = memref.subview %subview_27[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_39 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_38, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_40 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_39, %subview_40 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_41 = memref.subview %subview_11[0, 0, 0, 2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_42 = memref.subview %subview_12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_43 = memref.subview %subview_41[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_44 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_45 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_44, %subview_45 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_46 = memref.subview %subview_41[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_47 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_48 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_47, %subview_48 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_49 = memref.subview %subview_41[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_50 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_49, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_51 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_50, %subview_51 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_52 = memref.subview %subview_41[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_53 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_52, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_54 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_53, %subview_54 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg12 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %7 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_5 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> -# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: } {"./h"} # CHECK-NEXT: memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32> # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py index b240b6bbd..dd676fa28 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py @@ -181,58 +181,43 @@ # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index -# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_3 = arith.constant 0 : index -# CHECK-NEXT: %c32 = arith.constant 32 : index -# CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_5 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_0 = arith.constant 0 : index -# CHECK-NEXT: %c4_1 = arith.constant 4 : index -# CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %0) -> (memref<4x32xf32>) { +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_5 = arith.constant 0 : index -# CHECK-NEXT: %c32 = arith.constant 32 : index -# CHECK-NEXT: %c1_6 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %subview_4) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_8 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %subview_3[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_11 = arith.constant 0 : index -# CHECK-NEXT: %c512 = arith.constant 512 : index -# CHECK-NEXT: %c1_12 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg7 = %c0_11 to %c512 step %c1_12 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_14, %subview_15 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_16 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c512 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>) # CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %subview_13 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %2, %subview_7 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32> diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py index 04095b436..b83522853 100644 --- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py @@ -38,8 +38,8 @@ print(f"CODE: {res}") # CHECK: // -----// IR Dump Before transform //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> @@ -47,16 +47,13 @@ # CHECK-NEXT: %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> # CHECK-NEXT: %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> # CHECK-NEXT: %3 = tensor.empty() : tensor<4x32xf32> -# CHECK-NEXT: %collapsed = tensor.collapse_shape %2 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %cst_0 : tensor<4x32xf32>, f32) outs(%3 : tensor<4x32xf32>) attrs = {__xtc_id_relu_} { # CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): -# CHECK-NEXT: %6 = arith.maximumf %in, %in_1 : f32 -# CHECK-NEXT: linalg.yield %6 : f32 -# CHECK-NEXT: } -> tensor<128xf32> -# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> -# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: %5 = arith.maximumf %in, %in_1 : f32 +# CHECK-NEXT: linalg.yield %5 : f32 +# CHECK-NEXT: } -> tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { @@ -97,8 +94,8 @@ # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> @@ -115,82 +112,80 @@ # CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> -# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> -# CHECK-NEXT: scf.yield %inserted_slice_4 : tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} # CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> # CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { -# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> -# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> -# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) { -# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> -# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> -# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> -# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> # CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> # CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> # CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> # CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> -# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> -# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> # CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> # CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> # CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> # CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> -# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> -# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> -# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<2x32xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<2x32xf32> # CHECK-NEXT: } {"./j"} # CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> # CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %c0_1 = arith.constant 0 : index -# CHECK-NEXT: %c128 = arith.constant 128 : index -# CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> -# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_4: f32, %out: f32): -# CHECK-NEXT: %7 = arith.maximumf %in, %in_4 : f32 +# CHECK-NEXT: %c4_2 = arith.constant 4 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_5: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_5 : f32 # CHECK-NEXT: linalg.yield %7 : f32 -# CHECK-NEXT: } -> tensor<1xf32> -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32> -# CHECK-NEXT: scf.yield %inserted_slice : tensor<128xf32> +# CHECK-NEXT: } -> tensor<1x32xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> -# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { # CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> @@ -207,85 +202,82 @@ # CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> -# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> -# CHECK-NEXT: scf.yield %inserted_slice_4 : tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x32xf32> # CHECK-NEXT: } {"./j"} # CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> # CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> # CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { -# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> -# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> -# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) { -# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> -# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> -# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> -# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> # CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> # CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> # CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> # CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> -# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> -# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> # CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> # CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> # CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> # CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> -# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> -# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> -# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<2x32xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<2x32xf32> # CHECK-NEXT: } {"./j"} # CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> # CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<128xf32> +# CHECK-NEXT: %4 = tensor.empty() : tensor<4x32xf32> # CHECK-NEXT: %c0_1 = arith.constant 0 : index -# CHECK-NEXT: %c128 = arith.constant 128 : index -# CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> -# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32> -# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_4: f32, %out: f32): -# CHECK-NEXT: %7 = arith.maximumf %in, %in_4 : f32 +# CHECK-NEXT: %c4_2 = arith.constant 4 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_5: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_5 : f32 # CHECK-NEXT: linalg.yield %7 : f32 -# CHECK-NEXT: } -> tensor<1xf32> -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32> -# CHECK-NEXT: scf.yield %inserted_slice : tensor<128xf32> +# CHECK-NEXT: } -> tensor<1x32xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32> -# CHECK-NEXT: bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0)> -# CHECK-NEXT: #map1 = affine_map<(d0) -> ()> +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> # CHECK-NEXT: %0 = ub.poison : f32 # CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c2 = arith.constant 2 : index @@ -294,87 +286,79 @@ # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c0 = arith.constant 0 : index -# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_4 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) { -# CHECK-NEXT: %subview_5 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_6 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_6) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_8 = memref.subview %subview_4[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_10 = memref.subview %subview_5[%c0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_11 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %6 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> -# CHECK-NEXT: %7 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> -# CHECK-NEXT: %8 = vector.transfer_read %subview_11[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %subview_1 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_2) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_4 = memref.subview %subview_0[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_1[0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %6 = vector.transfer_read %subview_6[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %7 = vector.transfer_read %subview_4[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %subview_7[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> # CHECK-NEXT: %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32> # CHECK-NEXT: %11 = vector.broadcast %10 : f32 to vector<16xf32> # CHECK-NEXT: %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32> # CHECK-NEXT: %13 = vector.fma %11, %9, %12 : vector<16xf32> -# CHECK-NEXT: %14 = vector.insert %13, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: vector.transfer_write %14, %subview_11[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_12 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_11, %subview_12 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_13 = memref.subview %subview_5[%c1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_14 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %15 = vector.transfer_read %subview_13[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> -# CHECK-NEXT: %16 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> -# CHECK-NEXT: %17 = vector.transfer_read %subview_14[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> -# CHECK-NEXT: %18 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32> -# CHECK-NEXT: %19 = vector.extract %15[0, 0] : f32 from vector<1x1xf32> -# CHECK-NEXT: %20 = vector.broadcast %19 : f32 to vector<16xf32> -# CHECK-NEXT: %21 = vector.extract %17[0] : vector<16xf32> from vector<1x16xf32> -# CHECK-NEXT: %22 = vector.fma %20, %18, %21 : vector<16xf32> -# CHECK-NEXT: %23 = vector.insert %22, %cst [0] : vector<16xf32> into vector<1x16xf32> -# CHECK-NEXT: vector.transfer_write %23, %subview_14[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_14, %subview_15 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_9, %subview_16 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %14 = vector.broadcast %13 : vector<16xf32> to vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %14, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_7, %subview_8 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_1[1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %15 = vector.transfer_read %subview_9[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %16 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %17 = vector.extract %15[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %18 = vector.broadcast %17 : f32 to vector<16xf32> +# CHECK-NEXT: %19 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %20 = vector.fma %18, %9, %19 : vector<16xf32> +# CHECK-NEXT: %21 = vector.broadcast %20 : vector<16xf32> to vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %21, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_10, %subview_11 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_12 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_7 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_7 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_3 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: scf.yield %4 : memref<4x32xf32> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %collapse_shape = memref.collapse_shape %2 [[0, 1]] : memref<4x32xf32> into memref<128xf32> -# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<128xf32> -# CHECK-NEXT: %c0_2 = arith.constant 0 : index -# CHECK-NEXT: %c128 = arith.constant 128 : index -# CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c128 step %c1_3 iter_args(%arg4 = %alloca_1) -> (memref<128xf32>) { -# CHECK-NEXT: %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_0 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_4 : memref<1xf32, strided<[1], offset: ?>>) attrs = {__xtc_id_relu_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_6: f32, %out: f32): -# CHECK-NEXT: %4 = arith.maximumf %in, %in_6 : f32 +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %cst : memref<1x32xf32, strided<[32, 1], offset: ?>>, f32) outs(%subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): +# CHECK-NEXT: %4 = arith.maximumf %in, %in_2 : f32 # CHECK-NEXT: linalg.yield %4 : f32 # CHECK-NEXT: } -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_4, %subview_5 : memref<1xf32, strided<[1], offset: ?>> to memref<1xf32, strided<[1], offset: ?>> -# CHECK-NEXT: scf.yield %arg4 : memref<128xf32> +# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_0, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %expand_shape = memref.expand_shape %3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32> -# CHECK-NEXT: memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: memref.copy %3, %arg2 : memref<4x32xf32> to memref<4x32xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py index 255c463df..d9c94661b 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py @@ -426,170 +426,100 @@ # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) { -# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c12 = arith.constant 12 : index -# CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg5 = %c0_9 to %c12 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { -# CHECK-NEXT: %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c12_14 = arith.constant 12 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg7 = %c0_13 to %c12_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { -# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %c0_18 = arith.constant 0 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_19 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c3 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { -# CHECK-NEXT: %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) -# CHECK-NEXT: %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_16 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x12x12x3xf32> -# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: } {"./h"} # CHECK-NEXT: %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> # CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_2 = arith.constant 0 : index -# CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) { -# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c4 = arith.constant 4 : index -# CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg5 = %c0_9 to %c4 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c4_14 = arith.constant 4 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg7 = %c0_13 to %c4_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_18 = arith.constant 0 : index -# CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_19 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) -# CHECK-NEXT: %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_16 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_11 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> -# CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %c0_5 = arith.constant 0 : index -# CHECK-NEXT: %c1_6 = arith.constant 1 : index -# CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) { -# CHECK-NEXT: %subview_8 = memref.subview %0[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_11 = arith.constant 0 : index -# CHECK-NEXT: %c4 = arith.constant 4 : index -# CHECK-NEXT: %c1_12 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg5 = %c0_11 to %c4 step %c1_12 iter_args(%arg6 = %subview_10) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %4 = affine.apply #map(%arg5) -# CHECK-NEXT: %subview_14 = memref.subview %subview_8[0, %4, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index -# CHECK-NEXT: %c4_18 = arith.constant 4 : index -# CHECK-NEXT: %c1_19 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg7 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg8 = %subview_16) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %6 = affine.apply #map(%arg7) -# CHECK-NEXT: %subview_21 = memref.subview %subview_14[0, 0, %6, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> -# CHECK-NEXT: %subview_23 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_24 = arith.constant 0 : index -# CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_25 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg9 = %c0_24 to %c16 step %c1_25 iter_args(%arg10 = %subview_23) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_29 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_30 = arith.constant 0 : index -# CHECK-NEXT: %c5 = arith.constant 5 : index -# CHECK-NEXT: %c1_31 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg11 = %c0_30 to %c5 step %c1_31 iter_args(%arg12 = %subview_29) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_33 = memref.subview %subview_27[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_34 = memref.subview %subview_28[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_35 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_36 = arith.constant 0 : index -# CHECK-NEXT: %c5_37 = arith.constant 5 : index -# CHECK-NEXT: %c1_38 = arith.constant 1 : index -# CHECK-NEXT: %9 = scf.for %arg13 = %c0_36 to %c5_37 step %c1_38 iter_args(%arg14 = %subview_35) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_40 = memref.subview %subview_33[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_41 = memref.subview %subview_34[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_42 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: %c0_43 = arith.constant 0 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_44 = arith.constant 1 : index -# CHECK-NEXT: %10 = scf.for %arg15 = %c0_43 to %c3 step %c1_44 iter_args(%arg16 = %subview_42) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_46 = memref.subview %subview_40[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %subview_47 = memref.subview %subview_41[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: %subview_48 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_47 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_48 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_50: f32, %out: f32): -# CHECK-NEXT: %11 = arith.mulf %in, %in_50 : f32 -# CHECK-NEXT: %12 = arith.addf %out, %11 : f32 -# CHECK-NEXT: linalg.yield %12 : f32 -# CHECK-NEXT: } -# CHECK-NEXT: %subview_49 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_48, %subview_49 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg16 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %subview_45 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %10, %subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview_0 = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %3 = affine.apply #map(%arg3) +# CHECK-NEXT: %subview_1 = memref.subview %subview_0[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %5 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_7 = memref.subview %arg1[0, 0, 0, %arg7] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c5 step %c1 iter_args(%arg10 = %subview_8) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, %arg9, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_7[%arg9, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c5 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_16: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_16 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } # CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./s"} -# CHECK-NEXT: %subview_39 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %9, %subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg12 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./r"} -# CHECK-NEXT: %subview_32 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %8, %subview_32 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %subview_26 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %7, %subview_26 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_13 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_13 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %8 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_9 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_6 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_3 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> -# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: } {"./h"} # CHECK-NEXT: memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py index b05c8a8d7..e882104ab 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py @@ -286,109 +286,78 @@ # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index -# CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c16_17 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_2 = arith.constant 0 : index -# CHECK-NEXT: %c16_3 = arith.constant 16 : index -# CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c16_17 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_1) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_5 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %cst_7 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_8 = arith.constant 0 : index -# CHECK-NEXT: %c16_9 = arith.constant 16 : index -# CHECK-NEXT: %c1_10 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %alloca_6) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c16_17 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_11 = arith.constant 0 : index -# CHECK-NEXT: %c16_12 = arith.constant 16 : index -# CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 iter_args(%arg4 = %2) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_15 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %1[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>> -# CHECK-NEXT: %subview_17 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_18 = arith.constant 0 : index -# CHECK-NEXT: %c16_19 = arith.constant 16 : index -# CHECK-NEXT: %c1_20 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg5 = %c0_18 to %c16_19 step %c1_20 iter_args(%arg6 = %subview_17) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %subview_16[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_24 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %c0_25 = arith.constant 0 : index -# CHECK-NEXT: %c16_26 = arith.constant 16 : index -# CHECK-NEXT: %c1_27 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg7 = %c0_25 to %c16_26 step %c1_27 iter_args(%arg8 = %subview_24) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_29 = memref.subview %subview_22[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_30 = memref.subview %subview_23[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_31 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_31 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_32 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_31, %subview_32 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_4 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_5) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_7 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_8) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_7[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_10, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) # CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %subview_28 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_21 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_21 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_6 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_14 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: %subview_3 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview_3, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py index 7a9eb3442..ac4cee48b 100644 --- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -299,113 +299,77 @@ # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { -# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> -# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0 = arith.constant 0 : index -# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index # CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> # CHECK-NEXT: %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c32_10 = arith.constant 32 : index -# CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_0 = arith.constant 0 : index -# CHECK-NEXT: %c4_1 = arith.constant 4 : index -# CHECK-NEXT: %c1_2 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %0) -> (memref<4x32xf32>) { +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (memref<4x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_11 = arith.constant 0 : index -# CHECK-NEXT: %c32_12 = arith.constant 32 : index -# CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_15 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_18 = arith.constant 0 : index -# CHECK-NEXT: %c512 = arith.constant 512 : index -# CHECK-NEXT: %c1_19 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg8 = %c0_18 to %c512 step %c1_19 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_21 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> -# CHECK-NEXT: %subview_22 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_21, %subview_22 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_23 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_23, %subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg8 = %c0 to %c512 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) # CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %subview_20 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_20 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_4 = arith.constant 0 : index -# CHECK-NEXT: %c32 = arith.constant 32 : index -# CHECK-NEXT: %c1_5 = arith.constant 1 : index -# CHECK-NEXT: %2 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) { +# CHECK-NEXT: %2 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c32_10 = arith.constant 32 : index -# CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_6 = arith.constant 0 : index -# CHECK-NEXT: %c32_7 = arith.constant 32 : index -# CHECK-NEXT: %c1_8 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %2) -> (memref<32x32xf32>) { +# CHECK-NEXT: %3 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %2) -> (memref<32x32xf32>) { # CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_9 = memref.subview %1[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>> -# CHECK-NEXT: %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_11 = arith.constant 0 : index -# CHECK-NEXT: %c32_12 = arith.constant 32 : index -# CHECK-NEXT: %c1_13 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_15 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_16 = memref.subview %subview_9[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %c0_18 = arith.constant 0 : index -# CHECK-NEXT: %c4_19 = arith.constant 4 : index -# CHECK-NEXT: %c1_20 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg8 = %c0_18 to %c4_19 step %c1_20 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { -# CHECK-NEXT: %subview_22 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> -# CHECK-NEXT: %subview_23 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_22, %subview_23 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>>) -# CHECK-NEXT: %subview_25 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_24, %subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %1[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32> to memref<4x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) # CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %subview_21 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32> From 036a18e79f61e8cfaa0b90f78f542a90f213273b Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Tue, 24 Feb 2026 12:12:30 +0100 Subject: [PATCH 13/14] tensor-dialect: removed macos workaround --- src/xtc/backends/mlir/MlirCompilerPasses.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index 804adce6b..08835ade2 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -26,7 +26,6 @@ OpResult, ) from mlir.passmanager import PassManager -import platform # Import SDist if available try: @@ -557,10 +556,8 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram): bufferize_options = [ "bufferize-function-boundaries=1", "function-boundary-type-conversion=identity-layout-map", + "buffer-alignment=256", ] - # TODO: below is needed until macos mlir is updated - if platform.system() != "Darwin": - bufferize_options.append("buffer-alignment=256") apply_passes.run( [ "canonicalize", From 0bc7d2ab9a1986f212ddd77e41db8853602d0c46 Mon Sep 17 00:00:00 2001 From: Liam Semeria Date: Wed, 4 Mar 2026 14:51:35 +0100 Subject: [PATCH 14/14] tensor-dialect: updated to tensor.pad, doesnt work for c :( --- src/xtc/backends/mlir/MlirOps.py | 44 +- .../test_pad_conv2d_mlir_tensor.py | 923 +++++++++++++----- .../test_pad_matmul_unpad_mlir_tensor.py | 759 ++++++++++---- 3 files changed, 1232 insertions(+), 494 deletions(-) diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index 836f7d2e7..48e0a2f19 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -11,6 +11,7 @@ from xdsl.dialects.builtin import ( MemRefType, TensorType, + IndexType, f32, f64, i64, @@ -529,19 +530,25 @@ def generate_op( dims_value = list(self.args[:-1]) padding = self.attrs["padding"] constant_value = self.attrs["constant_value"] + lows = [0] * len(dims_value) + highs = [0] * len(dims_value) if isinstance(padding, dict): dims_value_before_pad = list(dims_value) for i, pad_value in padding.items(): dims_value_before_pad[i] -= sum(pad_value) + lows[i] = pad_value[0] + highs[i] = pad_value[1] else: dims_value_before_pad = [ dim_value - sum(padding) for dim_value in dims_value ] + lows = [padding[0] for d in dims_value] + highs = [padding[1] for d in dims_value] elt_type = {"float32": f32, "float64": f64}[dtype] elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) + MemRefType(elt_type, shape) # should be op_type here?? for shape in [dims_value_before_pad, dims_value] ] block = Block(arg_types=ops_types) @@ -560,20 +567,28 @@ def generate_op( with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size)) result = (args[1].type,) if using_tensors else () - fill = linalg.FillOp( - res=result, - inputs=(cst0.results[0],), - outputs=(args[1],), - ) + fill_node_id = f"{self.name}_0" if using_tensors: - copy = tensor.InsertSliceOp.from_static_parameters( + fill = None + block_in = Block(arg_types=[IndexType()] * len(dims_value)) + with ImplicitBuilder(block_in): + tensor.YieldOp(cst0) + copy = tensor.PadOp( source=args[0], - dest=fill.results[0], - offsets=offsets, - sizes=sizes, - strides=strides, + region=Region([block_in]), + low=[], + high=[], + nofold=UnitAttr(), + result_type=TensorType(elt_type, dims_value), + static_low=lows, + static_high=highs, ) else: + fill = linalg.FillOp( + res=result, + inputs=(cst0.results[0],), + outputs=(args[1],), + ) subview = memref.SubviewOp.from_static_parameters( source=args[1], source_type=args[1].type, # type: ignore @@ -586,14 +601,13 @@ def generate_op( outputs=[subview.result], res=result, ) - fill_node_id = f"{self.name}_0" - fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() + fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() copy_node_id = f"{self.name}" copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr() attrs = { "nodes_map": { - fill_node_id: fill, - copy_node_id: None if using_tensors else copy, + **({fill_node_id: fill} if fill else {}), + copy_node_id: copy, }, "dims_sizes": [ self.dims_sizes(), diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py index d9c94661b..2ebcefb39 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py @@ -41,18 +41,20 @@ # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %1 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%0 : tensor<1x12x12x3xf32>) -> tensor<1x12x12x3xf32> -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %padded = tensor.pad %arg0 nofold low[0, 2, 2, 0] high[0, 2, 2, 0] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x8x8x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x4x4x16xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> -# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%inserted_slice, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: %2 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%1 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> +# CHECK-NEXT: %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%padded, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%2 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { # CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): -# CHECK-NEXT: %5 = arith.mulf %in, %in_1 : f32 -# CHECK-NEXT: %6 = arith.addf %out, %5 : f32 -# CHECK-NEXT: linalg.yield %6 : f32 +# CHECK-NEXT: %4 = arith.mulf %in, %in_1 : f32 +# CHECK-NEXT: %5 = arith.addf %out, %4 : f32 +# CHECK-NEXT: linalg.yield %5 : f32 # CHECK-NEXT: } -> tensor<1x4x4x16xf32> -# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { @@ -60,7 +62,7 @@ # CHECK-NEXT: transform.yield # CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { -# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -98,332 +100,631 @@ # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)> +# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)> +# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)> +# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) { # CHECK-NEXT: %c0_8 = arith.constant 0 : index -# CHECK-NEXT: %c12 = arith.constant 12 : index -# CHECK-NEXT: %c1_9 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) { -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c12_13 = arith.constant 12 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) { -# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) { -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %6 = arith.cmpi eq, %c8, %c0_8 : index +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c8_10 = arith.constant 8 : index +# CHECK-NEXT: %7 = arith.cmpi eq, %c8_10, %c0_9 : index +# CHECK-NEXT: %8 = arith.ori %7, %6 : i1 +# CHECK-NEXT: %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x12x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32> +# CHECK-NEXT: %10 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %12 = affine.apply #map(%arg5) +# CHECK-NEXT: %13 = affine.max #map1(%arg5) +# CHECK-NEXT: %14 = affine.apply #map2(%arg5) +# CHECK-NEXT: %15 = affine.max #map3(%arg5) +# CHECK-NEXT: %16 = affine.min #map4(%15) +# CHECK-NEXT: %17 = affine.apply #map5(%13) +# CHECK-NEXT: %18 = affine.apply #map6(%16) +# CHECK-NEXT: %19 = affine.min #map7(%16, %13) +# CHECK-NEXT: %20 = affine.max #map8(%19) +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0_13 : index +# CHECK-NEXT: %22 = affine.apply #map5(%20) +# CHECK-NEXT: %23 = affine.apply #map9(%13, %20) +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c8_15 = arith.constant 8 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %c8_15, %c0_14 : index +# CHECK-NEXT: %25 = arith.ori %24, %21 : i1 +# CHECK-NEXT: %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32> +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %27 = tensor.empty() : tensor<1x1x12x3xf32> +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c12_21 = arith.constant 12 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %29 = affine.max #map10(%13) +# CHECK-NEXT: %30 = affine.apply #map11(%13) +# CHECK-NEXT: %31 = affine.max #map12(%13) +# CHECK-NEXT: %32 = affine.min #map13(%31, %20) +# CHECK-NEXT: %33 = affine.apply #map5(%29) +# CHECK-NEXT: %34 = affine.apply #map14(%20, %32) +# CHECK-NEXT: %35 = affine.min #map15(%20, %32, %29) +# CHECK-NEXT: %36 = affine.max #map8(%35) +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %37 = arith.cmpi eq, %36, %c0_24 : index +# CHECK-NEXT: %38 = affine.apply #map5(%36) +# CHECK-NEXT: %39 = affine.apply #map9(%29, %36) +# CHECK-NEXT: %40 = affine.apply #map(%arg7) +# CHECK-NEXT: %41 = affine.max #map1(%arg7) +# CHECK-NEXT: %42 = affine.apply #map2(%arg7) +# CHECK-NEXT: %43 = affine.max #map3(%arg7) +# CHECK-NEXT: %44 = affine.min #map4(%43) +# CHECK-NEXT: %45 = affine.apply #map5(%41) +# CHECK-NEXT: %46 = affine.apply #map6(%44) +# CHECK-NEXT: %47 = affine.min #map7(%44, %41) +# CHECK-NEXT: %48 = affine.max #map8(%47) +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %49 = arith.cmpi eq, %48, %c0_25 : index +# CHECK-NEXT: %50 = arith.ori %49, %37 : i1 +# CHECK-NEXT: %51 = affine.apply #map5(%48) +# CHECK-NEXT: %52 = affine.apply #map9(%41, %48) +# CHECK-NEXT: %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32> +# CHECK-NEXT: %c1_28 = arith.constant 1 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %54 = tensor.empty() : tensor<1x1x1x3xf32> +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %c2_30 = arith.constant 2 : index +# CHECK-NEXT: %c0_31 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %c1_34 = arith.constant 1 : index +# CHECK-NEXT: %56 = affine.max #map10(%29) +# CHECK-NEXT: %57 = affine.apply #map11(%29) +# CHECK-NEXT: %58 = affine.max #map12(%29) +# CHECK-NEXT: %59 = affine.min #map13(%58, %36) +# CHECK-NEXT: %60 = affine.apply #map5(%56) +# CHECK-NEXT: %61 = affine.apply #map14(%36, %59) +# CHECK-NEXT: %62 = affine.min #map15(%36, %59, %56) +# CHECK-NEXT: %63 = affine.max #map8(%62) +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %64 = arith.cmpi eq, %63, %c0_35 : index +# CHECK-NEXT: %65 = affine.apply #map5(%63) +# CHECK-NEXT: %66 = affine.apply #map9(%56, %63) +# CHECK-NEXT: %c2_36 = arith.constant 2 : index +# CHECK-NEXT: %67 = affine.max #map10(%41) +# CHECK-NEXT: %68 = affine.apply #map11(%41) +# CHECK-NEXT: %69 = affine.max #map12(%41) +# CHECK-NEXT: %70 = affine.min #map13(%69, %48) +# CHECK-NEXT: %71 = affine.apply #map5(%67) +# CHECK-NEXT: %72 = affine.apply #map14(%48, %70) +# CHECK-NEXT: %73 = affine.min #map15(%48, %70, %67) +# CHECK-NEXT: %74 = affine.max #map8(%73) +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %75 = arith.cmpi eq, %74, %c0_37 : index +# CHECK-NEXT: %76 = arith.ori %75, %64 : i1 +# CHECK-NEXT: %77 = affine.apply #map5(%74) +# CHECK-NEXT: %78 = affine.apply #map9(%67, %74) +# CHECK-NEXT: %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %cast_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %cast_33 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x1x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: scf.yield %11 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x12x12x3xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<1x4x4x16xf32> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_2 = arith.constant 0 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index # CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> # CHECK-NEXT: %c0_8 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_9 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c4_13 = arith.constant 4 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) { -# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c4_12 = arith.constant 4 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) { -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32> +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32> # CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32> # CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} # CHECK-NEXT: %c0_5 = arith.constant 0 : index # CHECK-NEXT: %c1_6 = arith.constant 1 : index # CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> # CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> # CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> # CHECK-NEXT: %c0_10 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %6 = affine.apply #map(%arg5) -# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> -# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> -# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c4_17 = arith.constant 4 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) { -# CHECK-NEXT: %8 = affine.apply #map(%arg7) -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> -# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> -# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> -# CHECK-NEXT: %c0_23 = arith.constant 0 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %7 = affine.apply #map16(%arg5) +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c4_16 = arith.constant 4 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %9 = affine.apply #map16(%arg7) +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_24 = arith.constant 1 : index -# CHECK-NEXT: %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) { -# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> -# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> -# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_29 = arith.constant 0 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_28 = arith.constant 0 : index # CHECK-NEXT: %c5 = arith.constant 5 : index -# CHECK-NEXT: %c1_30 = arith.constant 1 : index -# CHECK-NEXT: %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> -# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> -# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_35 = arith.constant 0 : index -# CHECK-NEXT: %c5_36 = arith.constant 5 : index -# CHECK-NEXT: %c1_37 = arith.constant 1 : index -# CHECK-NEXT: %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> -# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> -# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_42 = arith.constant 0 : index +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_34 = arith.constant 0 : index +# CHECK-NEXT: %c5_35 = arith.constant 5 : index +# CHECK-NEXT: %c1_36 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_41 = arith.constant 0 : index # CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_43 = arith.constant 1 : index -# CHECK-NEXT: %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_49: f32, %out: f32): -# CHECK-NEXT: %14 = arith.mulf %in, %in_49 : f32 -# CHECK-NEXT: %15 = arith.addf %out, %14 : f32 -# CHECK-NEXT: linalg.yield %15 : f32 +# CHECK-NEXT: %c1_42 = arith.constant 1 : index +# CHECK-NEXT: %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_48: f32, %out: f32): +# CHECK-NEXT: %15 = arith.mulf %in, %in_48 : f32 +# CHECK-NEXT: %16 = arith.addf %out, %15 : f32 +# CHECK-NEXT: linalg.yield %16 : f32 # CHECK-NEXT: } -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./s"} -# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./r"} -# CHECK-NEXT: %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32> # CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32> # CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)> +# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)> +# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)> +# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x12x12x3xf32> # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c1 = arith.constant 1 : index # CHECK-NEXT: %c1_0 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) { # CHECK-NEXT: %c0_8 = arith.constant 0 : index -# CHECK-NEXT: %c12 = arith.constant 12 : index -# CHECK-NEXT: %c1_9 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) { -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c12_13 = arith.constant 12 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) { -# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index -# CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) { -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32> -# CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %6 = arith.cmpi eq, %c8, %c0_8 : index +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c8_10 = arith.constant 8 : index +# CHECK-NEXT: %7 = arith.cmpi eq, %c8_10, %c0_9 : index +# CHECK-NEXT: %8 = arith.ori %7, %6 : i1 +# CHECK-NEXT: %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x12x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32> +# CHECK-NEXT: %10 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %12 = affine.apply #map(%arg5) +# CHECK-NEXT: %13 = affine.max #map1(%arg5) +# CHECK-NEXT: %14 = affine.apply #map2(%arg5) +# CHECK-NEXT: %15 = affine.max #map3(%arg5) +# CHECK-NEXT: %16 = affine.min #map4(%15) +# CHECK-NEXT: %17 = affine.apply #map5(%13) +# CHECK-NEXT: %18 = affine.apply #map6(%16) +# CHECK-NEXT: %19 = affine.min #map7(%16, %13) +# CHECK-NEXT: %20 = affine.max #map8(%19) +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0_13 : index +# CHECK-NEXT: %22 = affine.apply #map5(%20) +# CHECK-NEXT: %23 = affine.apply #map9(%13, %20) +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c8_15 = arith.constant 8 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %c8_15, %c0_14 : index +# CHECK-NEXT: %25 = arith.ori %24, %21 : i1 +# CHECK-NEXT: %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32> +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %27 = tensor.empty() : tensor<1x1x12x3xf32> +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c12_21 = arith.constant 12 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %29 = affine.max #map10(%13) +# CHECK-NEXT: %30 = affine.apply #map11(%13) +# CHECK-NEXT: %31 = affine.max #map12(%13) +# CHECK-NEXT: %32 = affine.min #map13(%31, %20) +# CHECK-NEXT: %33 = affine.apply #map5(%29) +# CHECK-NEXT: %34 = affine.apply #map14(%20, %32) +# CHECK-NEXT: %35 = affine.min #map15(%20, %32, %29) +# CHECK-NEXT: %36 = affine.max #map8(%35) +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %37 = arith.cmpi eq, %36, %c0_24 : index +# CHECK-NEXT: %38 = affine.apply #map5(%36) +# CHECK-NEXT: %39 = affine.apply #map9(%29, %36) +# CHECK-NEXT: %40 = affine.apply #map(%arg7) +# CHECK-NEXT: %41 = affine.max #map1(%arg7) +# CHECK-NEXT: %42 = affine.apply #map2(%arg7) +# CHECK-NEXT: %43 = affine.max #map3(%arg7) +# CHECK-NEXT: %44 = affine.min #map4(%43) +# CHECK-NEXT: %45 = affine.apply #map5(%41) +# CHECK-NEXT: %46 = affine.apply #map6(%44) +# CHECK-NEXT: %47 = affine.min #map7(%44, %41) +# CHECK-NEXT: %48 = affine.max #map8(%47) +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %49 = arith.cmpi eq, %48, %c0_25 : index +# CHECK-NEXT: %50 = arith.ori %49, %37 : i1 +# CHECK-NEXT: %51 = affine.apply #map5(%48) +# CHECK-NEXT: %52 = affine.apply #map9(%41, %48) +# CHECK-NEXT: %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32> +# CHECK-NEXT: %c1_28 = arith.constant 1 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %54 = tensor.empty() : tensor<1x1x1x3xf32> +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %c2_30 = arith.constant 2 : index +# CHECK-NEXT: %c0_31 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %c1_34 = arith.constant 1 : index +# CHECK-NEXT: %56 = affine.max #map10(%29) +# CHECK-NEXT: %57 = affine.apply #map11(%29) +# CHECK-NEXT: %58 = affine.max #map12(%29) +# CHECK-NEXT: %59 = affine.min #map13(%58, %36) +# CHECK-NEXT: %60 = affine.apply #map5(%56) +# CHECK-NEXT: %61 = affine.apply #map14(%36, %59) +# CHECK-NEXT: %62 = affine.min #map15(%36, %59, %56) +# CHECK-NEXT: %63 = affine.max #map8(%62) +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %64 = arith.cmpi eq, %63, %c0_35 : index +# CHECK-NEXT: %65 = affine.apply #map5(%63) +# CHECK-NEXT: %66 = affine.apply #map9(%56, %63) +# CHECK-NEXT: %c2_36 = arith.constant 2 : index +# CHECK-NEXT: %67 = affine.max #map10(%41) +# CHECK-NEXT: %68 = affine.apply #map11(%41) +# CHECK-NEXT: %69 = affine.max #map12(%41) +# CHECK-NEXT: %70 = affine.min #map13(%69, %48) +# CHECK-NEXT: %71 = affine.apply #map5(%67) +# CHECK-NEXT: %72 = affine.apply #map14(%48, %70) +# CHECK-NEXT: %73 = affine.min #map15(%48, %70, %67) +# CHECK-NEXT: %74 = affine.max #map8(%73) +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %75 = arith.cmpi eq, %74, %c0_37 : index +# CHECK-NEXT: %76 = arith.ori %75, %64 : i1 +# CHECK-NEXT: %77 = affine.apply #map5(%74) +# CHECK-NEXT: %78 = affine.apply #map9(%67, %74) +# CHECK-NEXT: %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %cast_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %cast_33 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x1x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: scf.yield %11 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x12x12x3xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<1x4x4x16xf32> # CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %c0_2 = arith.constant 0 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index # CHECK-NEXT: %c1_4 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { # CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> # CHECK-NEXT: %c0_8 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_9 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> -# CHECK-NEXT: %c0_12 = arith.constant 0 : index -# CHECK-NEXT: %c4_13 = arith.constant 4 : index -# CHECK-NEXT: %c1_14 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) { -# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> -# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c4_12 = arith.constant 4 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) { -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32> +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32> # CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32> # CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} # CHECK-NEXT: %c0_5 = arith.constant 0 : index # CHECK-NEXT: %c1_6 = arith.constant 1 : index # CHECK-NEXT: %c1_7 = arith.constant 1 : index -# CHECK-NEXT: %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> # CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> # CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> # CHECK-NEXT: %c0_10 = arith.constant 0 : index # CHECK-NEXT: %c4 = arith.constant 4 : index # CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { -# CHECK-NEXT: %6 = affine.apply #map(%arg5) -# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> -# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> -# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> -# CHECK-NEXT: %c0_16 = arith.constant 0 : index -# CHECK-NEXT: %c4_17 = arith.constant 4 : index -# CHECK-NEXT: %c1_18 = arith.constant 1 : index -# CHECK-NEXT: %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) { -# CHECK-NEXT: %8 = affine.apply #map(%arg7) -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> -# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> -# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> -# CHECK-NEXT: %c0_23 = arith.constant 0 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %7 = affine.apply #map16(%arg5) +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c4_16 = arith.constant 4 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %9 = affine.apply #map16(%arg7) +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index -# CHECK-NEXT: %c1_24 = arith.constant 1 : index -# CHECK-NEXT: %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) { -# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> -# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> -# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_29 = arith.constant 0 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_28 = arith.constant 0 : index # CHECK-NEXT: %c5 = arith.constant 5 : index -# CHECK-NEXT: %c1_30 = arith.constant 1 : index -# CHECK-NEXT: %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> -# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> -# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_35 = arith.constant 0 : index -# CHECK-NEXT: %c5_36 = arith.constant 5 : index -# CHECK-NEXT: %c1_37 = arith.constant 1 : index -# CHECK-NEXT: %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> -# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> -# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %c0_42 = arith.constant 0 : index +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_34 = arith.constant 0 : index +# CHECK-NEXT: %c5_35 = arith.constant 5 : index +# CHECK-NEXT: %c1_36 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_41 = arith.constant 0 : index # CHECK-NEXT: %c3 = arith.constant 3 : index -# CHECK-NEXT: %c1_43 = arith.constant 1 : index -# CHECK-NEXT: %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) { -# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> -# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { -# CHECK-NEXT: ^bb0(%in: f32, %in_49: f32, %out: f32): -# CHECK-NEXT: %14 = arith.mulf %in, %in_49 : f32 -# CHECK-NEXT: %15 = arith.addf %out, %14 : f32 -# CHECK-NEXT: linalg.yield %15 : f32 +# CHECK-NEXT: %c1_42 = arith.constant 1 : index +# CHECK-NEXT: %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_48: f32, %out: f32): +# CHECK-NEXT: %15 = arith.mulf %in, %in_48 : f32 +# CHECK-NEXT: %16 = arith.addf %out, %15 : f32 +# CHECK-NEXT: linalg.yield %16 : f32 # CHECK-NEXT: } -> tensor<1x1x1x1xf32> -# CHECK-NEXT: %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./s"} -# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32> # CHECK-NEXT: } {"./r"} -# CHECK-NEXT: %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32> # CHECK-NEXT: } {"./f"} -# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32> # CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> # CHECK-NEXT: } {"./b"} -# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // -# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> -# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> -# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2, 0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, d0 - 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (8, d0)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d1, d0)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1, d2) -> (-d2 + 1, d0 - d1)> +# CHECK-NEXT: #map8 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map9 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { # CHECK-NEXT: %c5 = arith.constant 5 : index @@ -435,27 +736,117 @@ # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32> # CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) { +# CHECK-NEXT: %3 = affine.max #map(%arg3) +# CHECK-NEXT: %4 = affine.max #map1(%arg3) +# CHECK-NEXT: %5 = affine.min #map2(%4) +# CHECK-NEXT: %6 = affine.min #map3(%5, %3) +# CHECK-NEXT: %7 = affine.max #map4(%6) +# CHECK-NEXT: %8 = arith.cmpi eq, %7, %c0 : index +# CHECK-NEXT: %9 = scf.if %8 -> (memref<1x1x12x3xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca : memref<1x1x12x3xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %10 = linalg.index 0 : index +# CHECK-NEXT: %11 = linalg.index 1 : index +# CHECK-NEXT: %12 = linalg.index 2 : index +# CHECK-NEXT: %13 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca : memref<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_2 = memref.subview %arg0[0, %5, 0, 0] [1, %7, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_3, %alloca_0 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32> +# CHECK-NEXT: %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32> +# CHECK-NEXT: %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32> +# CHECK-NEXT: %10 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %alloca_0) -> (memref<1x1x12x3xf32>) { +# CHECK-NEXT: %11 = affine.max #map5(%3) +# CHECK-NEXT: %12 = affine.min #map6(%11, %7) +# CHECK-NEXT: %13 = affine.min #map7(%7, %12, %3) +# CHECK-NEXT: %14 = affine.max #map4(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = affine.max #map(%arg5) +# CHECK-NEXT: %17 = affine.max #map1(%arg5) +# CHECK-NEXT: %18 = affine.min #map2(%17) +# CHECK-NEXT: %19 = affine.min #map3(%18, %16) +# CHECK-NEXT: %20 = affine.max #map4(%19) +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0 : index +# CHECK-NEXT: %22 = arith.ori %21, %15 : i1 +# CHECK-NEXT: %23 = scf.if %22 -> (memref<1x1x1x3xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_4 : memref<1x1x1x3xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %24 = linalg.index 0 : index +# CHECK-NEXT: %25 = linalg.index 1 : index +# CHECK-NEXT: %26 = linalg.index 2 : index +# CHECK-NEXT: %27 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_4 : memref<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %12, %18, 0] [1, %14, %20, 3] [1, 1, 1, 1] : memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_5 : memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32> +# CHECK-NEXT: %24 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %alloca_5) -> (memref<1x1x1x3xf32>) { +# CHECK-NEXT: %25 = affine.min #map6(%11, %14) +# CHECK-NEXT: %26 = affine.min #map7(%14, %25, %3) +# CHECK-NEXT: %27 = affine.max #map4(%26) +# CHECK-NEXT: %28 = arith.cmpi eq, %27, %c0 : index +# CHECK-NEXT: %29 = affine.apply #map8(%3, %27) +# CHECK-NEXT: %30 = affine.max #map5(%16) +# CHECK-NEXT: %31 = affine.min #map6(%30, %20) +# CHECK-NEXT: %32 = affine.min #map7(%20, %31, %16) +# CHECK-NEXT: %33 = affine.max #map4(%32) +# CHECK-NEXT: %34 = arith.cmpi eq, %33, %c0 : index +# CHECK-NEXT: %35 = arith.ori %34, %28 : i1 +# CHECK-NEXT: %36 = affine.apply #map8(%16, %33) +# CHECK-NEXT: %37 = scf.if %35 -> (memref<1x1x1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1x1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %38 = linalg.index 0 : index +# CHECK-NEXT: %39 = linalg.index 1 : index +# CHECK-NEXT: %40 = linalg.index 2 : index +# CHECK-NEXT: %41 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %25, %31, %arg7] [1, %27, %33, 1] [1, 1, 1, 1] : memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1x1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %38 = linalg.index 0 : index +# CHECK-NEXT: %39 = linalg.index 1 : index +# CHECK-NEXT: %40 = linalg.index 2 : index +# CHECK-NEXT: %41 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c1_13 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %dim_14 = memref.dim %subview_12, %c2 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %alloca_10[0, %3, %16, 0] [1, %dim, %dim_14, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_12, %subview_15 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %37, %subview_11 : memref<1x1x1x1xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %24 : memref<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %23, %subview_6 : memref<1x1x1x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: scf.yield %10 : memref<1x1x12x3xf32> +# CHECK-NEXT: } # CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { -# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) { -# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>) -# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: } {"./c"} -# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: } {"./w"} -# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> -# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %9, %subview_1 : memref<1x1x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x12x12x3xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> # CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) { # CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { @@ -475,13 +866,13 @@ # CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> # CHECK-NEXT: } {"./h"} -# CHECK-NEXT: %subview_0 = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> +# CHECK-NEXT: %subview = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> # CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) { -# CHECK-NEXT: %3 = affine.apply #map(%arg3) -# CHECK-NEXT: %subview_1 = memref.subview %subview_0[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %3 = affine.apply #map9(%arg3) +# CHECK-NEXT: %subview_1 = memref.subview %subview[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { -# CHECK-NEXT: %5 = affine.apply #map(%arg5) +# CHECK-NEXT: %5 = affine.apply #map9(%arg5) # CHECK-NEXT: %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> # CHECK-NEXT: %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { @@ -496,7 +887,7 @@ # CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { # CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> # CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> -# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: linalg.generic {indexing_maps = [#map10, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { # CHECK-NEXT: ^bb0(%in: f32, %in_16: f32, %out: f32): # CHECK-NEXT: %10 = arith.mulf %in, %in_16 : f32 # CHECK-NEXT: %11 = arith.addf %out, %10 : f32 diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py index e882104ab..c5e42160d 100644 --- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py +++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py @@ -37,18 +37,22 @@ # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %1 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%0 : tensor<16x16xf32>) -> tensor<16x16xf32> -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %padded = tensor.pad %arg0 nofold low[0, 0] high[2, 2] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor<14x14xf32> to tensor<16x16xf32> +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %3 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32> -# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %padded_1 = tensor.pad %arg1 nofold low[0, 0] high[2, 2] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor<14x14xf32> to tensor<16x16xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32> -# CHECK-NEXT: %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%inserted_slice, %inserted_slice_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32> -# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: %3 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %4 = linalg.matmul {__xtc_id_matmul_padded_} ins(%padded, %padded_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%3 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %5 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> # CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } @@ -57,12 +61,12 @@ # CHECK-NEXT: transform.yield # CHECK-NEXT: } # CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { -# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op -# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op # CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) # CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op # CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -84,206 +88,437 @@ # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %c0_1 = arith.constant 0 : index # CHECK-NEXT: %c16_2 = arith.constant 16 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> -# CHECK-NEXT: %cst_5 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_6 = arith.constant 0 : index -# CHECK-NEXT: %c16_7 = arith.constant 16 : index -# CHECK-NEXT: %c1_8 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: %6 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c16_6 = arith.constant 16 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16_13 = arith.constant 16 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x16xf32> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c16_10 = arith.constant 16 : index -# CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> -# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_15 = arith.constant 0 : index -# CHECK-NEXT: %c16_16 = arith.constant 16 : index -# CHECK-NEXT: %c1_17 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> -# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %c0_22 = arith.constant 0 : index -# CHECK-NEXT: %c16_23 = arith.constant 16 : index -# CHECK-NEXT: %c1_24 = arith.constant 1 : index -# CHECK-NEXT: %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) { -# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_29 : tensor<1x1xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c16_9 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c16_15 = arith.constant 16 : index +# CHECK-NEXT: %c1_16 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c16_21 = arith.constant 16 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_27 : tensor<1x1xf32> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x16xf32> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: %9 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> # CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %c0 = arith.constant 0 : index # CHECK-NEXT: %c16 = arith.constant 16 : index # CHECK-NEXT: %c1 = arith.constant 1 : index -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> # CHECK-NEXT: %c0_1 = arith.constant 0 : index # CHECK-NEXT: %c16_2 = arith.constant 16 : index # CHECK-NEXT: %c1_3 = arith.constant 1 : index -# CHECK-NEXT: %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32> -# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> -# CHECK-NEXT: %cst_5 = arith.constant 0.000000e+00 : f32 -# CHECK-NEXT: %c0_6 = arith.constant 0 : index -# CHECK-NEXT: %c16_7 = arith.constant 16 : index -# CHECK-NEXT: %c1_8 = arith.constant 1 : index -# CHECK-NEXT: %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_13 = arith.constant 0 : index -# CHECK-NEXT: %c16_14 = arith.constant 16 : index -# CHECK-NEXT: %c1_15 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x16xf32> +# CHECK-NEXT: %6 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c16_6 = arith.constant 16 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16_13 = arith.constant 16 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x16xf32> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %c0_9 = arith.constant 0 : index -# CHECK-NEXT: %c16_10 = arith.constant 16 : index -# CHECK-NEXT: %c1_11 = arith.constant 1 : index -# CHECK-NEXT: %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) { -# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> -# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %c0_15 = arith.constant 0 : index -# CHECK-NEXT: %c16_16 = arith.constant 16 : index -# CHECK-NEXT: %c1_17 = arith.constant 1 : index -# CHECK-NEXT: %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) { -# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> -# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> -# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %c0_22 = arith.constant 0 : index -# CHECK-NEXT: %c16_23 = arith.constant 16 : index -# CHECK-NEXT: %c1_24 = arith.constant 1 : index -# CHECK-NEXT: %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) { -# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> -# CHECK-NEXT: %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32> -# CHECK-NEXT: %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> -# CHECK-NEXT: scf.yield %inserted_slice_29 : tensor<1x1xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c16_9 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c16_15 = arith.constant 16 : index +# CHECK-NEXT: %c1_16 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c16_21 = arith.constant 16 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_27 : tensor<1x1xf32> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_25 : tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x16xf32> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> -# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<16x16xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %7 = tensor.empty() : tensor<14x14xf32> -# CHECK-NEXT: %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: %9 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> # CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: } # CHECK-NEXT: # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (14, d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 0, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (1, d0 - d1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (-d0 + 1)> # CHECK-NEXT: module attributes {transform.with_named_sequence} { # CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { # CHECK-NEXT: %c1 = arith.constant 1 : index @@ -292,72 +527,170 @@ # CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 # CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> # CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> # CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = affine.min #map(%arg3) +# CHECK-NEXT: %5 = affine.min #map1(%4) +# CHECK-NEXT: %6 = arith.cmpi eq, %5, %c0 : index +# CHECK-NEXT: %7 = scf.if %6 -> (memref<1x16xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_1 : memref<1x16xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %8 = linalg.index 0 : index +# CHECK-NEXT: %9 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_1 : memref<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %arg0[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref> +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_2 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_2) -> (memref<1x16xf32>) { +# CHECK-NEXT: %9 = affine.min #map2(%4) +# CHECK-NEXT: %10 = affine.min #map3(%5, %9) +# CHECK-NEXT: %11 = arith.cmpi eq, %10, %c0 : index +# CHECK-NEXT: %12 = affine.apply #map4(%10) +# CHECK-NEXT: %13 = affine.min #map(%arg5) +# CHECK-NEXT: %14 = affine.min #map1(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = arith.ori %15, %11 : i1 +# CHECK-NEXT: %17 = affine.apply #map4(%14) +# CHECK-NEXT: %18 = scf.if %16 -> (memref<1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref> to memref> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c0_13 : memref> +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %dim_15 = memref.dim %subview_12, %c1_14 : memref> +# CHECK-NEXT: %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref> +# CHECK-NEXT: memref.copy %subview_12, %subview_16 : memref> to memref> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: scf.yield %8 : memref<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> -# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_1) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_B_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_3) -> (memref<16x16xf32>) { +# CHECK-NEXT: %4 = affine.min #map(%arg3) +# CHECK-NEXT: %5 = affine.min #map1(%4) +# CHECK-NEXT: %6 = arith.cmpi eq, %5, %c0 : index +# CHECK-NEXT: %7 = scf.if %6 -> (memref<1x16xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_4 : memref<1x16xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %8 = linalg.index 0 : index +# CHECK-NEXT: %9 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_4 : memref<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %arg1[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref> +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_5) -> (memref<1x16xf32>) { +# CHECK-NEXT: %9 = affine.min #map2(%4) +# CHECK-NEXT: %10 = affine.min #map3(%5, %9) +# CHECK-NEXT: %11 = arith.cmpi eq, %10, %c0 : index +# CHECK-NEXT: %12 = affine.apply #map4(%10) +# CHECK-NEXT: %13 = affine.min #map(%arg5) +# CHECK-NEXT: %14 = affine.min #map1(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = arith.ori %15, %11 : i1 +# CHECK-NEXT: %17 = affine.apply #map4(%14) +# CHECK-NEXT: %18 = scf.if %16 -> (memref<1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref> to memref> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c0_13 : memref> +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %dim_15 = memref.dim %subview_12, %c1_14 : memref> +# CHECK-NEXT: %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref> +# CHECK-NEXT: memref.copy %subview_12, %subview_16 : memref> to memref> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: scf.yield %8 : memref<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_2 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>> # CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>) -# CHECK-NEXT: %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_6) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_7 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} # CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) { -# CHECK-NEXT: %subview_4 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_5) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_7 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_8) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { -# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: %subview_11 = memref.subview %subview_7[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_10, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_7) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_9 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_6[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) # CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./k"} -# CHECK-NEXT: %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %5, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: } {"./j"} -# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> -# CHECK-NEXT: memref.copy %4, %subview_6 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_8 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> # CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> # CHECK-NEXT: } {"./i"} -# CHECK-NEXT: %subview_3 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> -# CHECK-NEXT: memref.copy %subview_3, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: %subview = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> # CHECK-NEXT: return # CHECK-NEXT: } # CHECK-NEXT: }