diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index b4c9bfe3..8a761f87 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -20,6 +20,7 @@ from xtc.backends.mlir.MlirCompilerPasses import ( MlirProgramInsertTransformPass, MlirProgramApplyTransformPass, + apply_bufferization_passes, ) from xtc.backends.mlir.MlirTarget import ( @@ -149,6 +150,15 @@ def mlir_apply_transform_pass(self) -> None: if self._config.print_transformed_ir: self.dump_ir("IR Dump After transform") + def mlir_apply_tensor_lowering_pass(self) -> None: + if self._config.print_bufferization_ir: + self.dump_ir("IR Dump Before Tensor Lowering") + + apply_bufferization_passes(self._mlir_program) + + if self._config.print_bufferization_ir: + self.dump_ir("IR Dump After Tensor Lowering") + def _save_temp(self, fname: str, content: Any) -> None: if not self._config.save_temps: return @@ -196,4 +206,6 @@ def compile(self) -> None: self.mlir_apply_transform_pass() save_temp(mlir_atrn_dump_file, self._mlir_program.mlir_module) + self.mlir_apply_tensor_lowering_pass() + self._target.generate_code_for_target(self._mlir_program, dump_file=dump_file) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index de33ff28..08835ade 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -534,3 +534,44 @@ def run(self) -> None: transform_op.erase() else: break + + +class MlirProgramApplyPasses: + def __init__( + self, + mlir_program: RawMlirProgram, + ) -> None: + self._mlir_program = mlir_program + + def run(self, pass_names: list[str]) -> None: + ctx = self._mlir_program.mlir_context + pm = PassManager(context=ctx) + for name in pass_names: + pm.add(name) # type: ignore # no attribute add + pm.run(self._mlir_program.mlir_module.operation) + + +def apply_bufferization_passes(mlir_program: RawMlirProgram): + apply_passes = MlirProgramApplyPasses(mlir_program) + bufferize_options = [ + "bufferize-function-boundaries=1", + "function-boundary-type-conversion=identity-layout-map", + "buffer-alignment=256", + ] + apply_passes.run( + [ + "canonicalize", + "cse", + "eliminate-empty-tensors", # causes ops to write directly to out buffer + f"one-shot-bufferize{{{' '.join(bufferize_options)}}}", + "func.func(buffer-hoisting)", + "func.func(buffer-loop-hoisting)", + "drop-equivalent-buffer-results", + "func.func(promote-buffers-to-stack)", + ] + ) + + +def pre_transform_tensor_passes(mlir_program: RawMlirProgram): + apply_passes = MlirProgramApplyPasses(mlir_program) + # apply_passes.run(["eliminate-empty-tensors"]) diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py index 2d0ab512..65345681 100644 --- a/src/xtc/backends/mlir/MlirConfig.py +++ b/src/xtc/backends/mlir/MlirConfig.py @@ -22,6 +22,7 @@ class MlirConfig: print_assembly: bool = False visualize_jumps: bool = True print_lowered_ir: bool = False + print_bufferization_ir: bool = False debug: bool = False color: bool = False concluding_passes: list[str] = field(default_factory=list) diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 191cad02..ab973fb6 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -2,12 +2,12 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # -from typing import cast, Any +from typing import cast, Any, Type from typing_extensions import override from xdsl.dialects.func import FuncOp as xdslFuncOp -from xdsl.dialects import func, memref -from xdsl.dialects.builtin import MemRefType, f32, f64 +from xdsl.dialects import func, memref, tensor, bufferization +from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -28,7 +28,11 @@ def __init__( concluding_passes: list[str] = [], always_vectorize: bool = False, no_alias: bool = True, + use_tensor_dialect: bool = False, ): + self.xdsl_type: Type[TensorType] | Type[MemRefType] = ( + TensorType if use_tensor_dialect else MemRefType + ) if isinstance(xdsl_func, XTCGraph): assert nodes is None graph = xdsl_func @@ -62,13 +66,24 @@ def _init_from_xdsl( def _xdsl_generate_node( self, node: XTCNode, block: Block, variables: dict[str, Any] ): - operation = MlirOperation.from_operation(node.operation, name=node.name) + operation = MlirOperation.from_operation( + node.operation, + name=node.name, + op_type=self.xdsl_type, # type: ignore + ) names = [*node.inputs, *node.outputs] assert node.inputs_types is not None and node.outputs_types is not None types = [*node.inputs_types, *node.outputs_types] for name, type in zip(names, types): + if name in node.outputs and self.xdsl_type == TensorType: + with ImplicitBuilder(block): + variables[name] = tensor.EmptyOp( + dynamic_sizes=[], + tensor_type=self._xdsl_type_from_tensortype(type), + ).results[0] if name in variables: continue + assert self.xdsl_type != TensorType with ImplicitBuilder(block): elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) alloca = memref.AllocaOp.get( @@ -79,6 +94,11 @@ def _xdsl_generate_node( variables[name] = alloca.results[0] args = [variables[name] for name in names] _, attrs = operation.generate(block=block, args=args) + # the tensor dialect needs the result of the op, not the alloca + if self.xdsl_type == TensorType: + assert len(node.outputs) == len(attrs["output_nodes"]) + for name, output in zip(node.outputs, attrs["output_nodes"]): + variables[name] = output.results[0] return attrs def _init_from_graph( @@ -95,18 +115,34 @@ def _init_from_graph( ) params_types = [ self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type)) - for tensor_type in [*inputs_types, *outputs_types] + for tensor_type in inputs_types ] + # graph output types are always memrefs + params_types.extend( + self._memref_type_from_tensortype(cast(XTCTensorType, tensor_type)) + for tensor_type in outputs_types + ) inlined_block = Block(arg_types=params_types) variables = { name: arg for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args) } block_attrs = [] + for node in graph.nodes.values(): node_attrs = self._xdsl_generate_node(node, inlined_block, variables) block_attrs.append(node_attrs) with ImplicitBuilder(inlined_block): + if self.xdsl_type == TensorType: + # write the final tensor values to the output buffers + for name, out_arg in zip( + graph.outputs, inlined_block.args[-len(graph.outputs) :] + ): + bufferization.MaterializeInDestinationOp( + operands=((variables[name],), (out_arg,)), + result_types=((),), + attributes={"writable": UnitAttr(), "restrict": UnitAttr()}, + ) func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy payload = xdslFuncOp.from_region( @@ -128,6 +164,7 @@ def _init_from_graph( always_vectorize=always_vectorize, concluding_passes=concluding_passes, id=f"__xtc_id_{node_id}_", + xdsl_type=self.xdsl_type, ) return payload, nodes_dict @@ -136,11 +173,15 @@ def _xdsl_elt_shape_from_tensortype(self, type: XTCTensorType) -> tuple[Any, Any return (elt_type, type.constant_shape) def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any: + elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) + return self.xdsl_type(elt_type, shape) + + def _memref_type_from_tensortype(self, type: XTCTensorType) -> Any: elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) return MemRefType(elt_type, shape) def _np_types_spec( - self, types: list[MemRefType] + self, types: list[MemRefType] | list[TensorType] ) -> list[dict[str, tuple[int, ...] | str]]: types_map = {"f32": "float32", "f64": "float64"} types_spec: list[dict[str, tuple[int, ...] | str]] = [ @@ -156,12 +197,12 @@ def _np_types_spec( def np_inputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]] - list_memref_tys = cast(list[MemRefType], inputs_args_types) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types) # type: ignore + return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: # Assume inputs are first, and output is single last param outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]] - list_memref_tys = cast(list[MemRefType], outputs_args_types) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[MemRefType], outputs_args_types) + return self._np_types_spec(list_xdsl_tys) diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py index 135e99b8..f809e392 100644 --- a/src/xtc/backends/mlir/MlirNodeBackend.py +++ b/src/xtc/backends/mlir/MlirNodeBackend.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # -from typing import cast, Any +from typing import cast, Any, Type from typing_extensions import override from xdsl.ir import Operation as xdslOperation -from xdsl.dialects.builtin import MemRefType as xdslAnyMemRefType +from xdsl.dialects.builtin import MemRefType, TensorType from xdsl.dialects.builtin import UnitAttr as xdslUnitAttr from xtc.utils.xdsl_aux import xdsl_operator_to_function @@ -26,8 +26,10 @@ def __init__( always_vectorize: bool = False, no_alias: bool = True, id: str | None = None, + xdsl_type: Type[TensorType] | Type[MemRefType] = MemRefType, ): self._graph = None + self.xdsl_type = xdsl_type if id is None: self.op_id_attribute = f"__id{MlirNodeBackend.count}__" MlirNodeBackend.count += 1 @@ -48,7 +50,7 @@ def __init__( self.loop_stamps = loop_stamps def _np_types_spec( - self, types: list[xdslAnyMemRefType] + self, types: list[MemRefType | TensorType] ) -> list[dict[str, tuple[int, ...] | str]]: types_map = {"f32": "float32", "f64": "float64"} types_spec: list[dict[str, tuple[int, ...] | str]] = [ @@ -63,11 +65,11 @@ def _np_types_spec( @override def np_inputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.inputs] # type: ignore - list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) # type: ignore + return self._np_types_spec(list_xdsl_tys) @override def np_outputs_spec(self) -> list[dict[str, Any]]: list_attr_tys = [i.type for i in self.source_op.outputs] # type: ignore - list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys) - return self._np_types_spec(list_memref_tys) + list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys) # type: ignore + return self._np_types_spec(list_xdsl_tys) diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py index be687b44..48e0a2f1 100644 --- a/src/xtc/backends/mlir/MlirOps.py +++ b/src/xtc/backends/mlir/MlirOps.py @@ -7,9 +7,11 @@ from typing_extensions import override from typing import Any, Type, TypeAlias, cast -from xdsl.dialects import linalg, arith, builtin, memref +from xdsl.dialects import linalg, arith, builtin, memref, tensor from xdsl.dialects.builtin import ( MemRefType, + TensorType, + IndexType, f32, f64, i64, @@ -42,8 +44,9 @@ def __init__( args: tuple[Any, ...], attrs: dict[str, Any] = {}, name: str | None = None, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: - self.operator = operator(args, attrs, name=name) + self.operator = operator(args, attrs, name=name, op_type=op_type) self.args = args self.attrs = attrs self.name = self.operator.name if name is None else name @@ -78,7 +81,12 @@ def np_outputs_spec(self) -> list[dict[str, Any]]: return outputs_spec @classmethod - def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation": + def from_operation( + cls, + xtc_op: Operation, + name: str | None, + op_type: Type[MemRefType] | Type[TensorType], + ) -> "MlirOperation": dims = xtc_op.dims.values() dtype = xtc_op.inputs_types[0].dtype # TODO: currently get dtype from 1st arg args = tuple([*dims, dtype]) @@ -88,6 +96,7 @@ def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation": args, dict(attrs), name=name, + op_type=op_type, ) @@ -97,11 +106,16 @@ class MlirOperator(ABC): KINDS = "" def __init__( - self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None + self, + args: tuple[Any, ...], + attrs: dict[str, Any], + name: str | None = None, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: self.args = args self.attrs = {**attrs} self.name = name if name is not None else self.DEFAULT_NAME + self.op_type = op_type @abstractmethod def generate_op( @@ -149,23 +163,27 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] + self.op_type(elt_type, shape) + for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 3 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) + result = (args[2].type,) if self.op_type == TensorType else () fill = linalg.FillOp( - res=(), + res=result, inputs=(cst0.results[0],), outputs=(args[2],), ) reduce = linalg.MatmulOp( - res=(), + res=result, inputs=(args[0], args[1]), - outputs=(args[2],), + outputs=(fill.results[0],) + if self.op_type == TensorType + else (args[2],), ) fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" @@ -180,6 +198,7 @@ def generate_op( {"i": Ki, "j": Kj}, self.dims_sizes(), ], + "output_nodes": [reduce], } return block, attrs @@ -223,10 +242,14 @@ class MlirOperatorConv2D(MlirOperator): DEFAULT_STRIDE = (1, 1) def __init__( - self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None + self, + args: tuple[Any, ...], + attrs: dict[str, Any], + name: str | None = None, + op_type: Type[MemRefType] | Type[TensorType] = MemRefType, ) -> None: attrs = {"stride": self.DEFAULT_STRIDE, **attrs} - super().__init__(args, attrs, name) + super().__init__(args, attrs, name, op_type) @override def dims(self, kind: str = "") -> tuple[str, ...]: @@ -250,16 +273,17 @@ def generate_op( elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) for shape in [*inps_dims, out_dims] + self.op_type(elt_type, shape) for shape in [*inps_dims, out_dims] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 3 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) with ImplicitBuilder(block): + result = (args[2].type,) if self.op_type == TensorType else () cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) fill = linalg.FillOp( - res=(), + res=result, inputs=(cst0.results[0],), outputs=(args[2],), ) @@ -282,7 +306,9 @@ def generate_op( linalg.YieldOp(add) reduce = linalg.GenericOp( inputs=(args[0], args[1]), - outputs=(args[2],), + outputs=(fill.results[0],) + if self.op_type == TensorType + else (args[2],), body=Region([block_in]), # type: ignore # mypy issue with dataclass # ignore typing due to xdsl hints limitation indexing_maps=[ @@ -306,6 +332,7 @@ def generate_op( ), ], iterator_types=iterator_types, + result_types=result, ) fill_node_id = f"{self.name}_0" reduce_node_id = f"{self.name}" @@ -320,6 +347,7 @@ def generate_op( {"b": Kb, "h": Kh, "w": Kw, "f": Kf}, self.dims_sizes(), ], + "output_nodes": [reduce], } return block, attrs @@ -367,13 +395,14 @@ def generate_op( elt_type = {"float32": f32, "float64": f64}[dtype] elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: - ops_types = [MemRefType(elt_type, shape) for shape in [[Ki], [Ki]]] + ops_types = [self.op_type(elt_type, shape) for shape in [[Ki], [Ki]]] block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) inp_shape, out_shape = [ - list(cast(MemRefType, arg.type).get_shape()) for arg in args + list(cast(self.op_type, arg.type).get_shape()) # type: ignore + for arg in args ] inp_size, out_size = [mulall(shape) for shape in [inp_shape, out_shape]] assert inp_size == out_size @@ -392,50 +421,58 @@ def generate_op( ) ] ) - inp = memref.CollapseShapeOp( - operands=[args[0]], - properties=dict(reassociation=inp_reassociation), - result_types=[MemRefType(elt_type, (inp_size,))], - ) - out = memref.CollapseShapeOp( - operands=[args[1]], - properties=dict(reassociation=out_reassociation), - result_types=[MemRefType(elt_type, (out_size,))], - ) + if self.op_type == TensorType: + out_operand = args[1] + inp_operand = args[0] + rank = len(out_shape) + iterator_types = [StringAttr("parallel")] * rank + indexing_maps = [ + AffineMapAttr(AffineMap.identity(rank)), # input + AffineMapAttr( + AffineMap.identity(rank).drop_results(out_shape) + ), # scalar + AffineMapAttr(AffineMap.identity(rank)), # output + ] + else: + inp = memref.CollapseShapeOp( # type: ignore + operands=[args[0]], + properties=dict(reassociation=inp_reassociation), + result_types=[self.op_type(elt_type, (inp_size,))], + ) + inp_operand = inp.results[0] # type: ignore + out = memref.CollapseShapeOp( + operands=[args[1]], + properties=dict(reassociation=out_reassociation), + result_types=[self.op_type(elt_type, (out_size,))], + ) + out_operand = out.results[0] # type: ignore + iterator_types = [ + StringAttr({"P": "parallel", "R": "reduction"}[k]) + for k in self.KINDS + ] + # ignore typing due to xdsl hints limitation + indexing_maps = [ + AffineMapAttr(AffineMap.from_callable(lambda i: (i,))), # type: ignore + AffineMapAttr(AffineMap.from_callable(lambda _: ())), # type: ignore + AffineMapAttr(AffineMap.from_callable(lambda i: (i,))), # type: ignore + ] + iterator_types = [ + StringAttr({"P": "parallel", "R": "reduction"}[k]) + for k in self.KINDS + ] + result = (args[1].type,) if self.op_type == TensorType else () cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size)) - iterator_types = [ - StringAttr({"P": "parallel", "R": "reduction"}[k]) for k in self.KINDS - ] block_in = Block(arg_types=[f32, f32, f32]) with ImplicitBuilder(block_in): max = arith.MaximumfOp(block_in.args[0], block_in.args[1]) linalg.YieldOp(max) relu = linalg.GenericOp( - inputs=(inp.results[0], cst0.results[0]), - outputs=(out.results[0],), + inputs=(inp_operand, cst0.results[0]), + outputs=(out_operand,), body=Region([block_in]), # type: ignore # mypy issue with dataclass - # ignore typing due to xdsl hints limitation - indexing_maps=[ - AffineMapAttr( - AffineMap.from_callable( - lambda i: # type: ignore - (i,) - ) - ), - AffineMapAttr( - AffineMap.from_callable( - lambda _: # type: ignore - () - ) - ), - AffineMapAttr( - AffineMap.from_callable( - lambda i: # type: ignore - (i,) - ) - ), - ], + indexing_maps=indexing_maps, iterator_types=iterator_types, + result_types=result, ) relu_node_id = f"{self.name}" relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr() @@ -446,6 +483,7 @@ def generate_op( "dims_sizes": [ self.dims_sizes(), ], + "output_nodes": [relu], } return block, attrs @@ -492,25 +530,31 @@ def generate_op( dims_value = list(self.args[:-1]) padding = self.attrs["padding"] constant_value = self.attrs["constant_value"] + lows = [0] * len(dims_value) + highs = [0] * len(dims_value) if isinstance(padding, dict): dims_value_before_pad = list(dims_value) for i, pad_value in padding.items(): dims_value_before_pad[i] -= sum(pad_value) + lows[i] = pad_value[0] + highs[i] = pad_value[1] else: dims_value_before_pad = [ dim_value - sum(padding) for dim_value in dims_value ] + lows = [padding[0] for d in dims_value] + highs = [padding[1] for d in dims_value] elt_type = {"float32": f32, "float64": f64}[dtype] elt_size = {"float32": 32, "float64": 64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) + MemRefType(elt_type, shape) # should be op_type here?? for shape in [dims_value_before_pad, dims_value] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) if isinstance(padding, dict): offsets = [0 for _ in self.args[:-1]] for i, (pad_b, pad_a) in padding.items(): @@ -519,38 +563,57 @@ def generate_op( offsets = [padding[0] for _ in self.args[:-1]] sizes = list(dims_value_before_pad) strides = [1 for _ in self.args[:-1]] + using_tensors = self.op_type == TensorType with ImplicitBuilder(block): cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size)) - fill = linalg.FillOp( - res=(), - inputs=(cst0.results[0],), - outputs=(args[1],), - ) - subview = memref.SubviewOp.from_static_parameters( - source=args[1], - source_type=args[1].type, # type: ignore - offsets=offsets, - sizes=sizes, - strides=strides, - ) - copy = linalg.CopyOp( - inputs=[args[0]], - outputs=[subview.result], - res=(), - ) - fill_node_id = f"{self.name}_0" - fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() + result = (args[1].type,) if using_tensors else () + fill_node_id = f"{self.name}_0" + if using_tensors: + fill = None + block_in = Block(arg_types=[IndexType()] * len(dims_value)) + with ImplicitBuilder(block_in): + tensor.YieldOp(cst0) + copy = tensor.PadOp( + source=args[0], + region=Region([block_in]), + low=[], + high=[], + nofold=UnitAttr(), + result_type=TensorType(elt_type, dims_value), + static_low=lows, + static_high=highs, + ) + else: + fill = linalg.FillOp( + res=result, + inputs=(cst0.results[0],), + outputs=(args[1],), + ) + subview = memref.SubviewOp.from_static_parameters( + source=args[1], + source_type=args[1].type, # type: ignore + offsets=offsets, + sizes=sizes, + strides=strides, + ) + copy = linalg.CopyOp( # type: ignore + inputs=[args[0]], + outputs=[subview.result], + res=result, + ) + fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr() copy_node_id = f"{self.name}" copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr() attrs = { "nodes_map": { - fill_node_id: fill, + **({fill_node_id: fill} if fill else {}), copy_node_id: copy, }, "dims_sizes": [ self.dims_sizes(), - self.dims_sizes(), + *([] if using_tensors else [self.dims_sizes()]), ], + "output_nodes": [copy], } return block, attrs @@ -618,13 +681,13 @@ def generate_op( elt_type = {"float32": f32, "float64": f64}[dtype] if block is None: ops_types = [ - MemRefType(elt_type, shape) + self.op_type(elt_type, shape) for shape in [dims_values_before_unpad, dims_values] ] block = Block(arg_types=ops_types) args = block.args assert len(args) == 2 - assert all(isinstance(arg.type, MemRefType) for arg in args) + assert all(isinstance(arg.type, self.op_type) for arg in args) if isinstance(padding, dict): offsets = [0 for _ in self.args[:-1]] for i, (pad_b, _) in padding.items(): @@ -633,28 +696,36 @@ def generate_op( offsets = [padding[0] for _ in self.args[:-1]] sizes = dims_values strides = [1 for _ in self.args[:-1]] + using_tensors = self.op_type == TensorType with ImplicitBuilder(block): - subview = memref.SubviewOp.from_static_parameters( - source=args[0], - source_type=args[0].type, # type: ignore - offsets=offsets, - sizes=sizes, - strides=strides, - ) - copy = linalg.CopyOp( - inputs=[subview.result], - outputs=[args[1]], - res=(), - ) + if using_tensors: + copy = tensor.ExtractSliceOp.from_static_parameters( + source=args[0], + offsets=offsets, + sizes=sizes, + strides=strides, + ) + else: + subview = memref.SubviewOp.from_static_parameters( + source=args[0], + source_type=args[0].type, # type: ignore + offsets=offsets, + sizes=sizes, + strides=strides, + ) + copy = linalg.CopyOp( # type: ignore + inputs=[subview.result], + outputs=[args[1]], + res=(), + ) copy_node_id = f"{self.name}" copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr() attrs = { "nodes_map": { - copy_node_id: copy, + copy_node_id: None if using_tensors else copy, }, - "dims_sizes": [ - self.dims_sizes(), - ], + "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])], + "output_nodes": [copy], } return block, attrs diff --git a/src/xtc/utils/xdsl_aux.py b/src/xtc/utils/xdsl_aux.py index c339f02d..0061ccba 100644 --- a/src/xtc/utils/xdsl_aux.py +++ b/src/xtc/utils/xdsl_aux.py @@ -12,6 +12,7 @@ from xdsl.dialects.arith import ConstantOp from xdsl.dialects.builtin import ( MemRefType, + TensorType, IntegerAttr, FloatAttr, IntegerType, @@ -19,7 +20,7 @@ from xdsl.context import Context from xdsl.parser import Parser -from xdsl.dialects import func, linalg, arith, memref +from xdsl.dialects import func, linalg, arith, memref, tensor from xdsl.dialects.builtin import ModuleOp @@ -29,6 +30,7 @@ def parse_xdsl_module(source: str) -> ModuleOp: context.load_dialect(linalg.Linalg) context.load_dialect(arith.Arith) context.load_dialect(memref.MemRef) + context.load_dialect(tensor.Tensor) parser = Parser(context, source) module = parser.parse_module() return module @@ -39,7 +41,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp: operands = source_op.operands shaped_types, scalar_types = [], [] for o in operands: - if isa(o.type, MemRefType): + if isa(o.type, MemRefType) or isa(o.type, TensorType): shaped_types.append(o.type) else: scalar_types.append(o.type) @@ -49,7 +51,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp: concrete_operands = [] shaped_count, scalar_count = 0, 0 for o in operands: - if isa(o.type, MemRefType): + if isa(o.type, MemRefType) or isa(o.type, TensorType): concrete_operands.append(payload.args[shaped_count]) shaped_count += 1 else: diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py new file mode 100644 index 00000000..a6791aaa --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py @@ -0,0 +1,418 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +# Small conv2d +N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 3, 3, 3, 1, 1, "float32" +a = O.tensor((N, H + R - 1, W + S - 1, C), dtype, name="I") +b = O.tensor((R, S, C, F), dtype, name="W") + +with O.graph(name="conv2d_nhwc_mini") as gb: + O.conv2d(a, b, stride=(SH, SW), name="O") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="conv2d_nhwc_mini_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x8x8x16xf32>) -> tensor<1x8x8x16xf32> +# CHECK-NEXT: %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x10x10x3xf32>, tensor<3x3x3x16xf32>) outs(%1 : tensor<1x8x8x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %3 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %4 = arith.addf %out, %3 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 +# CHECK-NEXT: } -> tensor<1x8x8x16xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./f" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./c" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_7 = arith.constant 0 : index +# CHECK-NEXT: %c8_8 = arith.constant 8 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c8_12 = arith.constant 8 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_25 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_30 = arith.constant 0 : index +# CHECK-NEXT: %c3_31 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %c3_38 = arith.constant 3 : index +# CHECK-NEXT: %c1_39 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_45: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_45 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x8x8x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_7 = arith.constant 0 : index +# CHECK-NEXT: %c8_8 = arith.constant 8 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32> +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c8_12 = arith.constant 8 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_18 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_25 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_30 = arith.constant 0 : index +# CHECK-NEXT: %c3_31 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %c3_38 = arith.constant 3 : index +# CHECK-NEXT: %c1_39 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_45: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_45 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x8x8x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_3 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_3, %subview_4 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %subview[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg1[0, 0, 0, %arg7] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %subview_6) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %subview_2[0, %arg9, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_5[%arg9, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_10 = memref.subview %subview_8[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_9[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_14: f32, %out: f32): +# CHECK-NEXT: %7 = arith.mulf %in, %in_14 : f32 +# CHECK-NEXT: %8 = arith.addf %out, %7 : f32 +# CHECK-NEXT: linalg.yield %8 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %6 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %5 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_7 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_4 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x8x8x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: conv2d_nhwc_mini +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x10x10x3xfloat32 +# CHECK-NEXT: - %1 : 3x3x3x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 1x8x8x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py new file mode 100644 index 00000000..a363b4e1 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py @@ -0,0 +1,708 @@ +# RUN: python %s 2>&1 | filecheck %s + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend +from xtc.artifacts import get_operation +from xtc.artifacts import get_operation + +op = get_operation("conv2d", "ResNet18_01") +N, H, W, F, R, S, C = [op["dims"][k] for k in ["n", "h", "w", "f", "r", "s", "c"]] +SH, SW = [op["params"][k] for k in ["SH", "SW"]] +dtype = "float32" + +a = O.tensor((N, H + R - 1, W + S - 1, C), dtype) +b = O.tensor((R, S, C, F), dtype) + +with O.graph(name="conv2d_nhwc_r181") as gb: + O.conv2d(a, b, stride=(SH, SW), name="O") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sch.tile("w", {"w1": 4}) +sch.tile("f", {"f1": 16}) +sch.interchange(["b", "h", "w", "f", "r", "s", "c", "w1", "f1"]) +sch.vectorize(["f1"]) +sch.unroll({"w1": 4, "c": 3}) +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="conv2d_nhwc_r181_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32> +# CHECK-NEXT: %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%1 : tensor<1x112x112x64xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_0: f32, %out: f32): +# CHECK-NEXT: %3 = arith.mulf %in, %in_0 : f32 +# CHECK-NEXT: %4 = arith.addf %out, %3 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 +# CHECK-NEXT: } -> tensor<1x112x112x64xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./f" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./c" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_21 "./w1" : !transform.any_op +# CHECK-NEXT: transform.include @_vecto failures(suppress) (%tiled_linalg_op_20) : (!transform.any_op) -> () +# CHECK-NEXT: transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op +# CHECK-NEXT: transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op +# CHECK-NEXT: %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c6 = arith.constant 6 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32> +# CHECK-NEXT: %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32> +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32> +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %9 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c6 = arith.constant 6 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x112x112x64xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32> +# CHECK-NEXT: %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32> +# CHECK-NEXT: %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32> +# CHECK-NEXT: %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) { +# CHECK-NEXT: %6 = affine.apply #map(%arg7) +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32> +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32> +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32> +# CHECK-NEXT: %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_56: f32, %out: f32): +# CHECK-NEXT: %22 = arith.mulf %in, %in_56 : f32 +# CHECK-NEXT: %23 = arith.addf %out, %22 : f32 +# CHECK-NEXT: linalg.yield %23 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x16xf32> +# CHECK-NEXT: %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %9 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x112x112x64xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) { +# CHECK-NEXT: %c7 = arith.constant 7 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c112 = arith.constant 112 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c64 step %c1 iter_args(%arg8 = %subview_2) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>) +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_4, %subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_3 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview = memref.subview %arg0[0, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) { +# CHECK-NEXT: %2 = affine.apply #map(%arg3) +# CHECK-NEXT: %subview_0 = memref.subview %subview[0, %2, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c112 step %c4 iter_args(%arg6 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %4 = affine.apply #map(%arg5) +# CHECK-NEXT: %subview_3 = memref.subview %subview_0[0, 0, %4, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c64 step %c16 iter_args(%arg8 = %subview_4) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg1[0, 0, 0, %arg7] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %subview_7) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_9 = memref.subview %subview_3[0, %arg9, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_6[%arg9, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) { +# CHECK-NEXT: %subview_11 = memref.subview %subview_9[0, 0, %arg11, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, %arg11, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, 0, 0, 0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_15, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_16 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_17 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_16, %subview_17 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_18 = memref.subview %subview_13[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_18, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_20 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_19, %subview_20 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_21 = memref.subview %subview_13[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_21, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_23 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_22, %subview_23 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_24 = memref.subview %subview_13[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_25 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_24, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_26 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_25, %subview_26 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_27 = memref.subview %subview_11[0, 0, 0, 1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_27[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_29, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_30 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_31 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_30, %subview_31 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_32 = memref.subview %subview_27[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_33 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_32, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_34 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_33, %subview_34 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_35 = memref.subview %subview_27[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_36 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_35, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_37 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_36, %subview_37 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_38 = memref.subview %subview_27[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_39 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_38, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_40 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_39, %subview_40 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_41 = memref.subview %subview_11[0, 0, 0, 2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_42 = memref.subview %subview_12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_43 = memref.subview %subview_41[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_44 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_45 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_44, %subview_45 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_46 = memref.subview %subview_41[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_47 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_48 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_47, %subview_48 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_49 = memref.subview %subview_41[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_50 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_49, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_51 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_50, %subview_51 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: %subview_52 = memref.subview %subview_41[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_53 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_52, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs = {__xtc_id_O_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_55: f32, %out: f32): +# CHECK-NEXT: %8 = arith.mulf %in, %in_55 : f32 +# CHECK-NEXT: %9 = arith.addf %out, %8 : f32 +# CHECK-NEXT: linalg.yield %9 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_54 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_53, %subview_54 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg12 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %7 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_5 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x112x112x64xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: conv2d_nhwc_r181 +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x230x230x3xfloat32 +# CHECK-NEXT: - %1 : 7x7x3x64xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 1x112x112x64xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: conv2d(%0, %1, stride=(2, 2)) {name = 'O'} : [1x230x230x3xfloat32, 7x7x3x64xfloat32] -> [1x112x112x64xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py new file mode 100644 index 00000000..dd676fa2 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py @@ -0,0 +1,238 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="matmul_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_3 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_3 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_6 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_12 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg7 = %c0 to %c512 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %2, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py new file mode 100644 index 00000000..b8352285 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py @@ -0,0 +1,377 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul_relu") as gb: + m = O.matmul(a, b, name="matmul") + O.relu(m, name="relu") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler(default_node="matmul") +sch.tile("i", {"i1": 2}) +sch.tile("j", {"j1": 16}) +sch.interchange(["k", "i", "j", "i1", "j1"]) +sch.vectorize(["j1"]) +sch.unroll({"i1": 2}) +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="matmul_relu_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %cst_0 : tensor<4x32xf32>, f32) outs(%3 : tensor<4x32xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %5 = arith.maximumf %in, %in_1 : f32 +# CHECK-NEXT: linalg.yield %5 : f32 +# CHECK-NEXT: } -> tensor<4x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_matmul_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./k" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> () +# CHECK-NEXT: transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op +# CHECK-NEXT: %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %2 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./i" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> +# CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> +# CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> +# CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> +# CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<2x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %4 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c4_2 = arith.constant 4 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_5: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_5 : f32 +# CHECK-NEXT: linalg.yield %7 : f32 +# CHECK-NEXT: } -> tensor<1x32xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32> +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_5 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32> +# CHECK-NEXT: %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32> +# CHECK-NEXT: %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) { +# CHECK-NEXT: %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %13 = vector.broadcast %12 : f32 to vector<16xf32> +# CHECK-NEXT: %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %15 = vector.fma %13, %11, %14 : vector<16xf32> +# CHECK-NEXT: %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> +# CHECK-NEXT: %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32> +# CHECK-NEXT: %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %23 = vector.broadcast %22 : f32 to vector<16xf32> +# CHECK-NEXT: %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %25 = vector.fma %23, %21, %24 : vector<16xf32> +# CHECK-NEXT: %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32> +# CHECK-NEXT: %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32> +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32> +# CHECK-NEXT: %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_15 : tensor<2x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %6 : tensor<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %4 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c4_2 = arith.constant 4 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_5: f32, %out: f32): +# CHECK-NEXT: %7 = arith.maximumf %in, %in_5 : f32 +# CHECK-NEXT: linalg.yield %7 : f32 +# CHECK-NEXT: } -> tensor<1x32xf32> +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = ub.poison : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview_1 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_2) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_4 = memref.subview %subview_0[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_1[0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %6 = vector.transfer_read %subview_6[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %7 = vector.transfer_read %subview_4[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %8 = vector.transfer_read %subview_7[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %11 = vector.broadcast %10 : f32 to vector<16xf32> +# CHECK-NEXT: %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %13 = vector.fma %11, %9, %12 : vector<16xf32> +# CHECK-NEXT: %14 = vector.broadcast %13 : vector<16xf32> to vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %14, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_7, %subview_8 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_1[1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %15 = vector.transfer_read %subview_9[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32> +# CHECK-NEXT: %16 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32> +# CHECK-NEXT: %17 = vector.extract %15[0, 0] : f32 from vector<1x1xf32> +# CHECK-NEXT: %18 = vector.broadcast %17 : f32 to vector<16xf32> +# CHECK-NEXT: %19 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32> +# CHECK-NEXT: %20 = vector.fma %18, %9, %19 : vector<16xf32> +# CHECK-NEXT: %21 = vector.broadcast %20 : vector<16xf32> to vector<1x16xf32> +# CHECK-NEXT: vector.transfer_write %21, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_10, %subview_11 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_12 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_3 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_3 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.yield %4 : memref<4x32xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %cst : memref<1x32xf32, strided<[32, 1], offset: ?>>, f32) outs(%subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>>) attrs = {__xtc_id_relu_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_2: f32, %out: f32): +# CHECK-NEXT: %4 = arith.maximumf %in, %in_2 : f32 +# CHECK-NEXT: linalg.yield %4 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_0, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: memref.copy %3, %arg2 : memref<4x32xf32> to memref<4x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul_relu +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %3 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'matmul'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py new file mode 100644 index 00000000..2ebcefb3 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py @@ -0,0 +1,930 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +# Small conv2d +N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 5, 5, 3, 2, 2, "float32" +a = O.tensor((N, H, W, C), dtype, name="I") +b = O.tensor((R, S, C, F), dtype, name="W") + +with O.graph(name="pad_conv2d_nhwc_mini") as gb: + p = O.pad2d(a, padding=2, axes=(1, 2), name="pad") + O.conv2d(p, b, stride=(SH, SW), name="conv") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler() +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="pad_conv2d_nhwc_mini_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %padded = tensor.pad %arg0 nofold low[0, 2, 2, 0] high[0, 2, 2, 0] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x8x8x3xf32> to tensor<1x12x12x3xf32> +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %2 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%1 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> +# CHECK-NEXT: %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%padded, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%2 : tensor<1x4x4x16xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_1: f32, %out: f32): +# CHECK-NEXT: %4 = arith.mulf %in, %in_1 : f32 +# CHECK-NEXT: %5 = arith.addf %out, %4 : f32 +# CHECK-NEXT: linalg.yield %5 : f32 +# CHECK-NEXT: } -> tensor<1x4x4x16xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./c" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./f" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./b" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./h" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_19 "./w" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_21 "./f" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_23 "./r" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_25 "./s" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_27 "./c" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)> +# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)> +# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)> +# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %6 = arith.cmpi eq, %c8, %c0_8 : index +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c8_10 = arith.constant 8 : index +# CHECK-NEXT: %7 = arith.cmpi eq, %c8_10, %c0_9 : index +# CHECK-NEXT: %8 = arith.ori %7, %6 : i1 +# CHECK-NEXT: %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x12x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32> +# CHECK-NEXT: %10 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %12 = affine.apply #map(%arg5) +# CHECK-NEXT: %13 = affine.max #map1(%arg5) +# CHECK-NEXT: %14 = affine.apply #map2(%arg5) +# CHECK-NEXT: %15 = affine.max #map3(%arg5) +# CHECK-NEXT: %16 = affine.min #map4(%15) +# CHECK-NEXT: %17 = affine.apply #map5(%13) +# CHECK-NEXT: %18 = affine.apply #map6(%16) +# CHECK-NEXT: %19 = affine.min #map7(%16, %13) +# CHECK-NEXT: %20 = affine.max #map8(%19) +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0_13 : index +# CHECK-NEXT: %22 = affine.apply #map5(%20) +# CHECK-NEXT: %23 = affine.apply #map9(%13, %20) +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c8_15 = arith.constant 8 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %c8_15, %c0_14 : index +# CHECK-NEXT: %25 = arith.ori %24, %21 : i1 +# CHECK-NEXT: %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32> +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %27 = tensor.empty() : tensor<1x1x12x3xf32> +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c12_21 = arith.constant 12 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %29 = affine.max #map10(%13) +# CHECK-NEXT: %30 = affine.apply #map11(%13) +# CHECK-NEXT: %31 = affine.max #map12(%13) +# CHECK-NEXT: %32 = affine.min #map13(%31, %20) +# CHECK-NEXT: %33 = affine.apply #map5(%29) +# CHECK-NEXT: %34 = affine.apply #map14(%20, %32) +# CHECK-NEXT: %35 = affine.min #map15(%20, %32, %29) +# CHECK-NEXT: %36 = affine.max #map8(%35) +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %37 = arith.cmpi eq, %36, %c0_24 : index +# CHECK-NEXT: %38 = affine.apply #map5(%36) +# CHECK-NEXT: %39 = affine.apply #map9(%29, %36) +# CHECK-NEXT: %40 = affine.apply #map(%arg7) +# CHECK-NEXT: %41 = affine.max #map1(%arg7) +# CHECK-NEXT: %42 = affine.apply #map2(%arg7) +# CHECK-NEXT: %43 = affine.max #map3(%arg7) +# CHECK-NEXT: %44 = affine.min #map4(%43) +# CHECK-NEXT: %45 = affine.apply #map5(%41) +# CHECK-NEXT: %46 = affine.apply #map6(%44) +# CHECK-NEXT: %47 = affine.min #map7(%44, %41) +# CHECK-NEXT: %48 = affine.max #map8(%47) +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %49 = arith.cmpi eq, %48, %c0_25 : index +# CHECK-NEXT: %50 = arith.ori %49, %37 : i1 +# CHECK-NEXT: %51 = affine.apply #map5(%48) +# CHECK-NEXT: %52 = affine.apply #map9(%41, %48) +# CHECK-NEXT: %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32> +# CHECK-NEXT: %c1_28 = arith.constant 1 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %54 = tensor.empty() : tensor<1x1x1x3xf32> +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %c2_30 = arith.constant 2 : index +# CHECK-NEXT: %c0_31 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %c1_34 = arith.constant 1 : index +# CHECK-NEXT: %56 = affine.max #map10(%29) +# CHECK-NEXT: %57 = affine.apply #map11(%29) +# CHECK-NEXT: %58 = affine.max #map12(%29) +# CHECK-NEXT: %59 = affine.min #map13(%58, %36) +# CHECK-NEXT: %60 = affine.apply #map5(%56) +# CHECK-NEXT: %61 = affine.apply #map14(%36, %59) +# CHECK-NEXT: %62 = affine.min #map15(%36, %59, %56) +# CHECK-NEXT: %63 = affine.max #map8(%62) +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %64 = arith.cmpi eq, %63, %c0_35 : index +# CHECK-NEXT: %65 = affine.apply #map5(%63) +# CHECK-NEXT: %66 = affine.apply #map9(%56, %63) +# CHECK-NEXT: %c2_36 = arith.constant 2 : index +# CHECK-NEXT: %67 = affine.max #map10(%41) +# CHECK-NEXT: %68 = affine.apply #map11(%41) +# CHECK-NEXT: %69 = affine.max #map12(%41) +# CHECK-NEXT: %70 = affine.min #map13(%69, %48) +# CHECK-NEXT: %71 = affine.apply #map5(%67) +# CHECK-NEXT: %72 = affine.apply #map14(%48, %70) +# CHECK-NEXT: %73 = affine.min #map15(%48, %70, %67) +# CHECK-NEXT: %74 = affine.max #map8(%73) +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %75 = arith.cmpi eq, %74, %c0_37 : index +# CHECK-NEXT: %76 = arith.ori %75, %64 : i1 +# CHECK-NEXT: %77 = affine.apply #map5(%74) +# CHECK-NEXT: %78 = affine.apply #map9(%67, %74) +# CHECK-NEXT: %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %cast_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %cast_33 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x1x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: scf.yield %11 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c4_12 = arith.constant 4 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %7 = affine.apply #map16(%arg5) +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c4_16 = arith.constant 4 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %9 = affine.apply #map16(%arg7) +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_28 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_34 = arith.constant 0 : index +# CHECK-NEXT: %c5_35 = arith.constant 5 : index +# CHECK-NEXT: %c1_36 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_41 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_42 = arith.constant 1 : index +# CHECK-NEXT: %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_48: f32, %out: f32): +# CHECK-NEXT: %15 = arith.mulf %in, %in_48 : f32 +# CHECK-NEXT: %16 = arith.addf %out, %15 : f32 +# CHECK-NEXT: linalg.yield %16 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)> +# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)> +# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)> +# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)> +# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c1_0 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %6 = arith.cmpi eq, %c8, %c0_8 : index +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c8_10 = arith.constant 8 : index +# CHECK-NEXT: %7 = arith.cmpi eq, %c8_10, %c0_9 : index +# CHECK-NEXT: %8 = arith.ori %7, %6 : i1 +# CHECK-NEXT: %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x12x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32> +# CHECK-NEXT: %10 = tensor.empty() : tensor<1x12x12x3xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) { +# CHECK-NEXT: %12 = affine.apply #map(%arg5) +# CHECK-NEXT: %13 = affine.max #map1(%arg5) +# CHECK-NEXT: %14 = affine.apply #map2(%arg5) +# CHECK-NEXT: %15 = affine.max #map3(%arg5) +# CHECK-NEXT: %16 = affine.min #map4(%15) +# CHECK-NEXT: %17 = affine.apply #map5(%13) +# CHECK-NEXT: %18 = affine.apply #map6(%16) +# CHECK-NEXT: %19 = affine.min #map7(%16, %13) +# CHECK-NEXT: %20 = affine.max #map8(%19) +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0_13 : index +# CHECK-NEXT: %22 = affine.apply #map5(%20) +# CHECK-NEXT: %23 = affine.apply #map9(%13, %20) +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c8_15 = arith.constant 8 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %c8_15, %c0_14 : index +# CHECK-NEXT: %25 = arith.ori %24, %21 : i1 +# CHECK-NEXT: %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32> +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %27 = tensor.empty() : tensor<1x1x12x3xf32> +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c12_21 = arith.constant 12 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) { +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %29 = affine.max #map10(%13) +# CHECK-NEXT: %30 = affine.apply #map11(%13) +# CHECK-NEXT: %31 = affine.max #map12(%13) +# CHECK-NEXT: %32 = affine.min #map13(%31, %20) +# CHECK-NEXT: %33 = affine.apply #map5(%29) +# CHECK-NEXT: %34 = affine.apply #map14(%20, %32) +# CHECK-NEXT: %35 = affine.min #map15(%20, %32, %29) +# CHECK-NEXT: %36 = affine.max #map8(%35) +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %37 = arith.cmpi eq, %36, %c0_24 : index +# CHECK-NEXT: %38 = affine.apply #map5(%36) +# CHECK-NEXT: %39 = affine.apply #map9(%29, %36) +# CHECK-NEXT: %40 = affine.apply #map(%arg7) +# CHECK-NEXT: %41 = affine.max #map1(%arg7) +# CHECK-NEXT: %42 = affine.apply #map2(%arg7) +# CHECK-NEXT: %43 = affine.max #map3(%arg7) +# CHECK-NEXT: %44 = affine.min #map4(%43) +# CHECK-NEXT: %45 = affine.apply #map5(%41) +# CHECK-NEXT: %46 = affine.apply #map6(%44) +# CHECK-NEXT: %47 = affine.min #map7(%44, %41) +# CHECK-NEXT: %48 = affine.max #map8(%47) +# CHECK-NEXT: %c0_25 = arith.constant 0 : index +# CHECK-NEXT: %49 = arith.cmpi eq, %48, %c0_25 : index +# CHECK-NEXT: %50 = arith.ori %49, %37 : i1 +# CHECK-NEXT: %51 = affine.apply #map5(%48) +# CHECK-NEXT: %52 = affine.apply #map9(%41, %48) +# CHECK-NEXT: %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32> +# CHECK-NEXT: %c1_28 = arith.constant 1 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %54 = tensor.empty() : tensor<1x1x1x3xf32> +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %c2_30 = arith.constant 2 : index +# CHECK-NEXT: %c0_31 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_32 = arith.constant 1 : index +# CHECK-NEXT: %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) { +# CHECK-NEXT: %c1_34 = arith.constant 1 : index +# CHECK-NEXT: %56 = affine.max #map10(%29) +# CHECK-NEXT: %57 = affine.apply #map11(%29) +# CHECK-NEXT: %58 = affine.max #map12(%29) +# CHECK-NEXT: %59 = affine.min #map13(%58, %36) +# CHECK-NEXT: %60 = affine.apply #map5(%56) +# CHECK-NEXT: %61 = affine.apply #map14(%36, %59) +# CHECK-NEXT: %62 = affine.min #map15(%36, %59, %56) +# CHECK-NEXT: %63 = affine.max #map8(%62) +# CHECK-NEXT: %c0_35 = arith.constant 0 : index +# CHECK-NEXT: %64 = arith.cmpi eq, %63, %c0_35 : index +# CHECK-NEXT: %65 = affine.apply #map5(%63) +# CHECK-NEXT: %66 = affine.apply #map9(%56, %63) +# CHECK-NEXT: %c2_36 = arith.constant 2 : index +# CHECK-NEXT: %67 = affine.max #map10(%41) +# CHECK-NEXT: %68 = affine.apply #map11(%41) +# CHECK-NEXT: %69 = affine.max #map12(%41) +# CHECK-NEXT: %70 = affine.min #map13(%69, %48) +# CHECK-NEXT: %71 = affine.apply #map5(%67) +# CHECK-NEXT: %72 = affine.apply #map14(%48, %70) +# CHECK-NEXT: %73 = affine.min #map15(%48, %70, %67) +# CHECK-NEXT: %74 = affine.max #map8(%73) +# CHECK-NEXT: %c0_37 = arith.constant 0 : index +# CHECK-NEXT: %75 = arith.cmpi eq, %74, %c0_37 : index +# CHECK-NEXT: %76 = arith.ori %75, %64 : i1 +# CHECK-NEXT: %77 = affine.apply #map5(%74) +# CHECK-NEXT: %78 = affine.apply #map9(%67, %74) +# CHECK-NEXT: %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] { +# CHECK-NEXT: ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32> +# CHECK-NEXT: %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %cast_40 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: scf.yield %cast_33 : tensor<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x1x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: scf.yield %11 : tensor<1x12x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x12x12x3xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<1x4x4x16xf32> +# CHECK-NEXT: %cst_1 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %c1_4 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_9 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c4_12 = arith.constant 4 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c1_6 = arith.constant 1 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32> +# CHECK-NEXT: %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32> +# CHECK-NEXT: %c0_10 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) { +# CHECK-NEXT: %7 = affine.apply #map16(%arg5) +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c4_16 = arith.constant 4 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) { +# CHECK-NEXT: %9 = affine.apply #map16(%arg7) +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32> +# CHECK-NEXT: %c0_22 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_23 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) { +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_28 = arith.constant 0 : index +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c1_29 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32> +# CHECK-NEXT: %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32> +# CHECK-NEXT: %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_34 = arith.constant 0 : index +# CHECK-NEXT: %c5_35 = arith.constant 5 : index +# CHECK-NEXT: %c1_36 = arith.constant 1 : index +# CHECK-NEXT: %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32> +# CHECK-NEXT: %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32> +# CHECK-NEXT: %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %c0_41 = arith.constant 0 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c1_42 = arith.constant 1 : index +# CHECK-NEXT: %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) { +# CHECK-NEXT: %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32> +# CHECK-NEXT: %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_48: f32, %out: f32): +# CHECK-NEXT: %15 = arith.mulf %in, %in_48 : f32 +# CHECK-NEXT: %16 = arith.addf %out, %15 : f32 +# CHECK-NEXT: linalg.yield %16 : f32 +# CHECK-NEXT: } -> tensor<1x1x1x1xf32> +# CHECK-NEXT: %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<1x4x4x16xf32> +# CHECK-NEXT: } {"./b"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2, 0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, d0 - 2)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (8, d0)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0, 0)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d1, d0)> +# CHECK-NEXT: #map7 = affine_map<(d0, d1, d2) -> (-d2 + 1, d0 - d1)> +# CHECK-NEXT: #map8 = affine_map<(d0, d1) -> (-d0 - d1 + 1)> +# CHECK-NEXT: #map9 = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: #map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)> +# CHECK-NEXT: #map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +# CHECK-NEXT: #map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %c5 = arith.constant 5 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c3 = arith.constant 3 : index +# CHECK-NEXT: %c12 = arith.constant 12 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32> +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) { +# CHECK-NEXT: %3 = affine.max #map(%arg3) +# CHECK-NEXT: %4 = affine.max #map1(%arg3) +# CHECK-NEXT: %5 = affine.min #map2(%4) +# CHECK-NEXT: %6 = affine.min #map3(%5, %3) +# CHECK-NEXT: %7 = affine.max #map4(%6) +# CHECK-NEXT: %8 = arith.cmpi eq, %7, %c0 : index +# CHECK-NEXT: %9 = scf.if %8 -> (memref<1x1x12x3xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca : memref<1x1x12x3xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %10 = linalg.index 0 : index +# CHECK-NEXT: %11 = linalg.index 1 : index +# CHECK-NEXT: %12 = linalg.index 2 : index +# CHECK-NEXT: %13 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca : memref<1x1x12x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_2 = memref.subview %arg0[0, %5, 0, 0] [1, %7, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_3, %alloca_0 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32> +# CHECK-NEXT: %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32> +# CHECK-NEXT: %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32> +# CHECK-NEXT: %10 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %alloca_0) -> (memref<1x1x12x3xf32>) { +# CHECK-NEXT: %11 = affine.max #map5(%3) +# CHECK-NEXT: %12 = affine.min #map6(%11, %7) +# CHECK-NEXT: %13 = affine.min #map7(%7, %12, %3) +# CHECK-NEXT: %14 = affine.max #map4(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = affine.max #map(%arg5) +# CHECK-NEXT: %17 = affine.max #map1(%arg5) +# CHECK-NEXT: %18 = affine.min #map2(%17) +# CHECK-NEXT: %19 = affine.min #map3(%18, %16) +# CHECK-NEXT: %20 = affine.max #map4(%19) +# CHECK-NEXT: %21 = arith.cmpi eq, %20, %c0 : index +# CHECK-NEXT: %22 = arith.ori %21, %15 : i1 +# CHECK-NEXT: %23 = scf.if %22 -> (memref<1x1x1x3xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_4 : memref<1x1x1x3xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %24 = linalg.index 0 : index +# CHECK-NEXT: %25 = linalg.index 1 : index +# CHECK-NEXT: %26 = linalg.index 2 : index +# CHECK-NEXT: %27 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_4 : memref<1x1x1x3xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %12, %18, 0] [1, %14, %20, 3] [1, 1, 1, 1] : memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_5 : memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32> +# CHECK-NEXT: %24 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %alloca_5) -> (memref<1x1x1x3xf32>) { +# CHECK-NEXT: %25 = affine.min #map6(%11, %14) +# CHECK-NEXT: %26 = affine.min #map7(%14, %25, %3) +# CHECK-NEXT: %27 = affine.max #map4(%26) +# CHECK-NEXT: %28 = arith.cmpi eq, %27, %c0 : index +# CHECK-NEXT: %29 = affine.apply #map8(%3, %27) +# CHECK-NEXT: %30 = affine.max #map5(%16) +# CHECK-NEXT: %31 = affine.min #map6(%30, %20) +# CHECK-NEXT: %32 = affine.min #map7(%20, %31, %16) +# CHECK-NEXT: %33 = affine.max #map4(%32) +# CHECK-NEXT: %34 = arith.cmpi eq, %33, %c0 : index +# CHECK-NEXT: %35 = arith.ori %34, %28 : i1 +# CHECK-NEXT: %36 = affine.apply #map8(%16, %33) +# CHECK-NEXT: %37 = scf.if %35 -> (memref<1x1x1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1x1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %38 = linalg.index 0 : index +# CHECK-NEXT: %39 = linalg.index 1 : index +# CHECK-NEXT: %40 = linalg.index 2 : index +# CHECK-NEXT: %41 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1x1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[0, %25, %31, %arg7] [1, %27, %33, 1] [1, 1, 1, 1] : memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1x1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %38 = linalg.index 0 : index +# CHECK-NEXT: %39 = linalg.index 1 : index +# CHECK-NEXT: %40 = linalg.index 2 : index +# CHECK-NEXT: %41 = linalg.index 3 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c1_13 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %dim_14 = memref.dim %subview_12, %c2 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %alloca_10[0, %3, %16, 0] [1, %dim, %dim_14, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_12, %subview_15 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1x1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %37, %subview_11 : memref<1x1x1x1xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x3xf32> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %24 : memref<1x1x1x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %23, %subview_6 : memref<1x1x1x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x12x3xf32> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: scf.yield %10 : memref<1x1x12x3xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: memref.copy %9, %subview_1 : memref<1x1x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x12x12x3xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_conv_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) +# CHECK-NEXT: %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_4 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: %subview = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) { +# CHECK-NEXT: %3 = affine.apply #map9(%arg3) +# CHECK-NEXT: %subview_1 = memref.subview %subview[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %5 = affine.apply #map9(%arg5) +# CHECK-NEXT: %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_7 = memref.subview %arg1[0, 0, 0, %arg7] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: %7 = scf.for %arg9 = %c0 to %c5 step %c1 iter_args(%arg10 = %subview_8) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_10 = memref.subview %subview_4[0, %arg9, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_11 = memref.subview %subview_7[%arg9, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %8 = scf.for %arg11 = %c0 to %c5 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_10[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_11[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>> +# CHECK-NEXT: linalg.generic {indexing_maps = [#map10, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs = {__xtc_id_conv_} { +# CHECK-NEXT: ^bb0(%in: f32, %in_16: f32, %out: f32): +# CHECK-NEXT: %10 = arith.mulf %in, %in_16 : f32 +# CHECK-NEXT: %11 = arith.addf %out, %10 : f32 +# CHECK-NEXT: linalg.yield %11 : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./c"} +# CHECK-NEXT: scf.yield %9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./s"} +# CHECK-NEXT: scf.yield %8 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./r"} +# CHECK-NEXT: %subview_9 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./f"} +# CHECK-NEXT: %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %6, %subview_6 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: } {"./w"} +# CHECK-NEXT: %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_3 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<1x4x4x16xf32> +# CHECK-NEXT: } {"./h"} +# CHECK-NEXT: memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: pad_conv2d_nhwc_mini +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 1x8x8x3xfloat32 +# CHECK-NEXT: - %1 : 5x5x3x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %3 : 1x4x4x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: pad2d(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32] +# CHECK-NEXT: - %3: conv2d(%2, %1, stride=(2, 2)) {name = 'conv'} : [1x12x12x3xfloat32, 5x5x3x16xfloat32] -> [1x4x4x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py new file mode 100644 index 00000000..c5e42160 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py @@ -0,0 +1,711 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 14, 14, 14, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="pad_matmul_unpad") as gb: + p1 = O.pad(a, padding=(0, 2), name="A_pad") + p2 = O.pad(b, padding=(0, 2), name="B_pad") + m_pad = O.matmul(p1, p2, name="matmul_padded") + O.unpad(m_pad, padding=(0, 2), name="C") +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) +sch = impl.get_scheduler(default_node="matmul_padded") +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="gen_pad_tuple_matmul_unpad_mlir", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %padded = tensor.pad %arg0 nofold low[0, 0] high[2, 2] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor<14x14xf32> to tensor<16x16xf32> +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %padded_1 = tensor.pad %arg1 nofold low[0, 0] high[2, 2] { +# CHECK-NEXT: ^bb0(%arg3: index, %arg4: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor<14x14xf32> to tensor<16x16xf32> +# CHECK-NEXT: %2 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %3 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %4 = linalg.matmul {__xtc_id_matmul_padded_} ins(%padded, %padded_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%3 : tensor<16x16xf32>) -> tensor<16x16xf32> +# CHECK-NEXT: %5 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./k" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c16_2 = arith.constant 16 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %6 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c16_6 = arith.constant 16 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16_13 = arith.constant 16 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c16_9 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c16_15 = arith.constant 16 : index +# CHECK-NEXT: %c1_16 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c16_21 = arith.constant 16 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_27 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %9 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)> +# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)> +# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst : f32 +# CHECK-NEXT: } {__xtc_id_A_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %c0_1 = arith.constant 0 : index +# CHECK-NEXT: %c16_2 = arith.constant 16 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %10 = affine.min #map(%arg3) +# CHECK-NEXT: %11 = affine.apply #map1(%10) +# CHECK-NEXT: %12 = affine.min #map2(%10) +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %13 = arith.cmpi eq, %12, %c0_11 : index +# CHECK-NEXT: %14 = affine.apply #map3(%12) +# CHECK-NEXT: %15 = affine.apply #map3(%12) +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c14 = arith.constant 14 : index +# CHECK-NEXT: %16 = arith.cmpi eq, %c14, %c0_12 : index +# CHECK-NEXT: %17 = arith.ori %16, %13 : i1 +# CHECK-NEXT: %18 = scf.if %17 -> (tensor<1x16xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg5: index, %arg6: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x16xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %19 = tensor.empty() : tensor<1x16xf32> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c0_16 = arith.constant 0 : index +# CHECK-NEXT: %c16_17 = arith.constant 16 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %c0_19 = arith.constant 0 : index +# CHECK-NEXT: %21 = affine.min #map4(%12) +# CHECK-NEXT: %22 = affine.apply #map5(%12, %21) +# CHECK-NEXT: %23 = affine.min #map6(%12, %21) +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %24 = arith.cmpi eq, %23, %c0_20 : index +# CHECK-NEXT: %25 = affine.apply #map3(%23) +# CHECK-NEXT: %26 = affine.apply #map3(%23) +# CHECK-NEXT: %27 = affine.min #map(%arg5) +# CHECK-NEXT: %28 = affine.apply #map1(%27) +# CHECK-NEXT: %29 = affine.min #map2(%27) +# CHECK-NEXT: %c0_21 = arith.constant 0 : index +# CHECK-NEXT: %30 = arith.cmpi eq, %29, %c0_21 : index +# CHECK-NEXT: %31 = arith.ori %30, %24 : i1 +# CHECK-NEXT: %32 = affine.apply #map3(%29) +# CHECK-NEXT: %33 = affine.apply #map3(%29) +# CHECK-NEXT: %34 = scf.if %31 -> (tensor<1x1xf32>) { +# CHECK-NEXT: %generated = tensor.generate { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } : tensor<1x1xf32> +# CHECK-NEXT: scf.yield %generated : tensor<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor to tensor +# CHECK-NEXT: %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] { +# CHECK-NEXT: ^bb0(%arg7: index, %arg8: index): +# CHECK-NEXT: tensor.yield %cst_0 : f32 +# CHECK-NEXT: } {__xtc_id_B_pad_} : tensor to tensor +# CHECK-NEXT: %cast_24 = tensor.cast %padded : tensor to tensor<1x1xf32> +# CHECK-NEXT: scf.yield %cast_24 : tensor<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_22 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: scf.yield %cast : tensor<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %6 = tensor.empty() : tensor<16x16xf32> +# CHECK-NEXT: %cst_4 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_5 = arith.constant 0 : index +# CHECK-NEXT: %c16_6 = arith.constant 16 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_12 = arith.constant 0 : index +# CHECK-NEXT: %c16_13 = arith.constant 16 : index +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_16 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_8 = arith.constant 0 : index +# CHECK-NEXT: %c16_9 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) { +# CHECK-NEXT: %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32> +# CHECK-NEXT: %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: %c16_15 = arith.constant 16 : index +# CHECK-NEXT: %c1_16 = arith.constant 1 : index +# CHECK-NEXT: %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) { +# CHECK-NEXT: %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32> +# CHECK-NEXT: %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32> +# CHECK-NEXT: %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_20 = arith.constant 0 : index +# CHECK-NEXT: %c16_21 = arith.constant 16 : index +# CHECK-NEXT: %c1_22 = arith.constant 1 : index +# CHECK-NEXT: %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_27 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %9 = tensor.empty() : tensor<14x14xf32> +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (14, d0)> +# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14, 1)> +# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 0, 1)> +# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (1, d0 - d1)> +# CHECK-NEXT: #map4 = affine_map<(d0) -> (-d0 + 1)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) { +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) { +# CHECK-NEXT: %4 = affine.min #map(%arg3) +# CHECK-NEXT: %5 = affine.min #map1(%4) +# CHECK-NEXT: %6 = arith.cmpi eq, %5, %c0 : index +# CHECK-NEXT: %7 = scf.if %6 -> (memref<1x16xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_1 : memref<1x16xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %8 = linalg.index 0 : index +# CHECK-NEXT: %9 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_1 : memref<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %arg0[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref> +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_2 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_2) -> (memref<1x16xf32>) { +# CHECK-NEXT: %9 = affine.min #map2(%4) +# CHECK-NEXT: %10 = affine.min #map3(%5, %9) +# CHECK-NEXT: %11 = arith.cmpi eq, %10, %c0 : index +# CHECK-NEXT: %12 = affine.apply #map4(%10) +# CHECK-NEXT: %13 = affine.min #map(%arg5) +# CHECK-NEXT: %14 = affine.min #map1(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = arith.ori %15, %11 : i1 +# CHECK-NEXT: %17 = affine.apply #map4(%14) +# CHECK-NEXT: %18 = scf.if %16 -> (memref<1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref> to memref> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c0_13 : memref> +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %dim_15 = memref.dim %subview_12, %c1_14 : memref> +# CHECK-NEXT: %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref> +# CHECK-NEXT: memref.copy %subview_12, %subview_16 : memref> to memref> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: scf.yield %8 : memref<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32> +# CHECK-NEXT: %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32> +# CHECK-NEXT: %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_3) -> (memref<16x16xf32>) { +# CHECK-NEXT: %4 = affine.min #map(%arg3) +# CHECK-NEXT: %5 = affine.min #map1(%4) +# CHECK-NEXT: %6 = arith.cmpi eq, %5, %c0 : index +# CHECK-NEXT: %7 = scf.if %6 -> (memref<1x16xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_4 : memref<1x16xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %8 = linalg.index 0 : index +# CHECK-NEXT: %9 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_4 : memref<1x16xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_7 = memref.subview %arg1[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref> +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %alloca_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32> +# CHECK-NEXT: %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32> +# CHECK-NEXT: %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_5) -> (memref<1x16xf32>) { +# CHECK-NEXT: %9 = affine.min #map2(%4) +# CHECK-NEXT: %10 = affine.min #map3(%5, %9) +# CHECK-NEXT: %11 = arith.cmpi eq, %10, %c0 : index +# CHECK-NEXT: %12 = affine.apply #map4(%10) +# CHECK-NEXT: %13 = affine.min #map(%arg5) +# CHECK-NEXT: %14 = affine.min #map1(%13) +# CHECK-NEXT: %15 = arith.cmpi eq, %14, %c0 : index +# CHECK-NEXT: %16 = arith.ori %15, %11 : i1 +# CHECK-NEXT: %17 = affine.apply #map4(%14) +# CHECK-NEXT: %18 = scf.if %16 -> (memref<1x1xf32>) { +# CHECK-NEXT: linalg.map outs(%alloca_9 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: scf.yield %alloca_9 : memref<1x1xf32> +# CHECK-NEXT: } else { +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref> to memref> +# CHECK-NEXT: linalg.map outs(%alloca_10 : memref<1x1xf32>) +# CHECK-NEXT: () { +# CHECK-NEXT: %19 = linalg.index 0 : index +# CHECK-NEXT: %20 = linalg.index 1 : index +# CHECK-NEXT: linalg.yield %cst : f32 +# CHECK-NEXT: } +# CHECK-NEXT: %c0_13 = arith.constant 0 : index +# CHECK-NEXT: %dim = memref.dim %subview_12, %c0_13 : memref> +# CHECK-NEXT: %c1_14 = arith.constant 1 : index +# CHECK-NEXT: %dim_15 = memref.dim %subview_12, %c1_14 : memref> +# CHECK-NEXT: %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref> +# CHECK-NEXT: memref.copy %subview_12, %subview_16 : memref> to memref> +# CHECK-NEXT: scf.yield %alloca_10 : memref<1x1xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: scf.yield %8 : memref<1x16xf32> +# CHECK-NEXT: } +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_6) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_8, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_7 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) { +# CHECK-NEXT: %subview_6 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_7) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_9 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) { +# CHECK-NEXT: %subview_12 = memref.subview %subview_6[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_8 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg4 : memref<16x16xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %subview = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>> +# CHECK-NEXT: memref.copy %subview, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: pad_matmul_unpad +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 14x14xfloat32 +# CHECK-NEXT: - %1 : 14x14xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %5 : 14x14xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: pad(%0, padding=(0, 2), constant_value=0) {name = 'A_pad'} : [14x14xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %3: pad(%1, padding=(0, 2), constant_value=0) {name = 'B_pad'} : [14x14xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %4: matmul(%2, %3) {name = 'matmul_padded'} : [16x16xfloat32, 16x16xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: - %5: unpad(%4, padding=(0, 2)) {name = 'C'} : [16x16xfloat32] -> [14x14xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py new file mode 100644 index 00000000..ac4cee48 --- /dev/null +++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py @@ -0,0 +1,392 @@ +# RUN: python %s 2>&1 | filecheck %s +# UNSUPPORTED: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir import Backend + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") +c = O.tensor((J, I), dtype, name="C") + +with O.graph(name="matmul") as gb: + d = O.matmul(a, b, name="D") + O.matmul(c, d, name="E") + +graph = gb.graph +print(graph) + +impl = Backend(graph, use_tensor_dialect=True) + +sch = impl.get_scheduler(default_node = "E") +sched = sch.schedule() + +comp = impl.get_compiler( + shared_lib=True, + dump_file="two_matmul_mlir_tensor", + print_source_ir=True, + print_transformed_ir=True, + print_bufferization_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") + +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %1 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %2 = linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32> +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> +# CHECK-NEXT: %cst_0 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32> +# CHECK-NEXT: %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %2 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_D_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_D_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_E_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_11 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_E_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_13 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_15 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_17 "./k" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> +# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c32_7 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c4_18 = arith.constant 4 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %0 = tensor.empty() : tensor<4x32xf32> +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c1_2 = arith.constant 1 : index +# CHECK-NEXT: %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_18 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_23 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_19 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = tensor.empty() : tensor<32x32xf32> +# CHECK-NEXT: %cst_3 = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_9 = arith.constant 0 : index +# CHECK-NEXT: %c32_10 = arith.constant 32 : index +# CHECK-NEXT: %c1_11 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_13 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c32_7 = arith.constant 32 : index +# CHECK-NEXT: %c1_8 = arith.constant 1 : index +# CHECK-NEXT: %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) { +# CHECK-NEXT: %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32> +# CHECK-NEXT: %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c32_12 = arith.constant 32 : index +# CHECK-NEXT: %c1_13 = arith.constant 1 : index +# CHECK-NEXT: %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) { +# CHECK-NEXT: %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32> +# CHECK-NEXT: %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32> +# CHECK-NEXT: %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c4_18 = arith.constant 4 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) { +# CHECK-NEXT: %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32> +# CHECK-NEXT: %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32> +# CHECK-NEXT: %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32> +# CHECK-NEXT: scf.yield %inserted_slice_24 : tensor<1x1xf32> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice_20 : tensor<1x32xf32> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32> +# CHECK-NEXT: scf.yield %inserted_slice : tensor<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> () +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) { +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32> +# CHECK-NEXT: %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (memref<4x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg8 = %c0 to %c512 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_D_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<4x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %2 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_E_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %3 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %2) -> (memref<32x32xf32>) { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_2 = memref.subview %1[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32> to memref<4x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %5 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) { +# CHECK-NEXT: %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>> +# CHECK-NEXT: %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_E_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: scf.yield %arg5 : memref<32x32xf32> +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32> +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: - %2 : 32x4xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %4 : 32x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %3: matmul(%0, %1) {name = 'D'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0