diff --git a/Makefile b/Makefile index 9ec20a8d8..78fe54df8 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ check-lit-c: check-lit-nvgpu: [ `uname -s` = Darwin ] || env XTC_MLIR_TARGET=nvgpu lit -v tests/filecheck/backends tests/filecheck/mlir_loop tests/filecheck/evaluation +check-lit-mppa: + env XTC_MLIR_TARGET=mppa lit -v -j 1 tests/filecheck/backends/target_mppa + check-pytest: scripts/pytest/run_pytest.sh -v diff --git a/sdist_requirements.txt b/sdist_requirements.txt index 0cf8abdf4..b7fd6d145 100644 --- a/sdist_requirements.txt +++ b/sdist_requirements.txt @@ -1,4 +1,4 @@ --index-url https://gitlab.inria.fr/api/v4/groups/corse/-/packages/pypi/simple -mlir-sdist==21.1.2.2026012001 +mlir-sdist==21.1.2.2026021601 mlir==21.1.2.2025091603 xtc-mlir==21.1.2.2 diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index b4c9bfe34..fa85fd00c 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -28,13 +28,14 @@ get_default_target, ) from xtc.utils.ext_tools import get_shlib_extension +from xtc.itf.runtime.common import CommonRuntimeInterface class MlirCompiler(itf.comp.Compiler): def __init__( self, backend: "backend.MlirBackend", - target: str | None = None, + target: str | CommonRuntimeInterface | None = None, **kwargs: Any, ): self._backend = backend @@ -44,9 +45,12 @@ def __init__( self._config = MlirConfig(**kwargs) if target is None: self._target = get_default_target()(self._config) - else: + elif isinstance(target, str): self._target = get_target_from_name(target)(self._config) + elif isinstance(target, CommonRuntimeInterface): + self._target = get_target_from_name(target.target_name())(self._config) assert self._target is not None + self._runtime_target = target self._compiler_kwargs = kwargs @property @@ -136,6 +140,7 @@ def mlir_insert_transform_pass(self) -> None: concluding_passes=self._config.concluding_passes, always_vectorize=self._config.always_vectorize, 
vectors_size=self._config.vectors_size, + target=self._target, ) insert_transform_pass.run() if self._config.print_source_ir: @@ -157,6 +162,8 @@ def _save_temp(self, fname: str, content: Any) -> None: outf.write(str(content)) def _register_mlir_extensions(self) -> None: + for extension in self._config.required_extensions: + self._mlir_program.require_extension(extension, weak=False) if self._mlir_schedule is not None: for extension, weak in self._mlir_schedule.mlir_extensions.items(): self._mlir_program.require_extension(extension, weak=weak) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index de33ff28d..10d80eb55 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -38,6 +38,7 @@ from .MlirProgram import RawMlirProgram from .MlirScheduler import MlirSchedule, MlirNodeSchedule +from .MlirTarget import MlirTarget _VECTO_SEQ_NAME = "_vecto" _SUPER_VECTORIZE_SEQ_NAME = "_super_vectorize" @@ -98,12 +99,14 @@ class MlirProgramInsertTransformPass: def __init__( self, mlir_program: RawMlirProgram, + target: MlirTarget, mlir_schedule: MlirSchedule | None = None, concluding_passes: list[str] = [], always_vectorize: bool = True, vectors_size: int | None = None, ) -> None: self._mlir_program = mlir_program + self._target = target self._mlir_schedule = mlir_schedule self._loc = Location.unknown(self._mlir_program.mlir_context) self._concluding_passes = concluding_passes @@ -428,12 +431,15 @@ def _vectorize(self, sched_state: SchedulingState): if self._vectors_size is not None: return - transform.IncludeOp( - results_=[], - target=_VECTO_SEQ_NAME, - failure_propagation_mode=2, - operands_=[sched_state.handle], - ) + if self._target.custom_vectorize(): + self._target.apply_custom_vectorize(sched_state.handle) + else: + transform.IncludeOp( + results_=[], + target=_VECTO_SEQ_NAME, + failure_propagation_mode=2, + operands_=[sched_state.handle], + ) def 
_post_vectorize(self, sched_state: SchedulingState): if self._vectors_size is not None: diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py index 2d0ab5128..0823ebfd9 100644 --- a/src/xtc/backends/mlir/MlirConfig.py +++ b/src/xtc/backends/mlir/MlirConfig.py @@ -30,6 +30,7 @@ class MlirConfig: arch: str = "native" cpu: str = "native" selected_device: int | None = None + required_extensions: list[str] = field(default_factory=list) def __post_init__(self): object.__setattr__( diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 191cad027..ca9650e48 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -7,7 +7,14 @@ from xdsl.dialects.func import FuncOp as xdslFuncOp from xdsl.dialects import func, memref -from xdsl.dialects.builtin import MemRefType, f32, f64 +from xdsl.dialects.builtin import ( + MemRefType, + f32, + f64, + ArrayAttr, + UnitAttr, + DictionaryAttr, +) from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -97,6 +104,14 @@ def _init_from_graph( self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type)) for tensor_type in [*inputs_types, *outputs_types] ] + arg_attrs = ArrayAttr( + [ + DictionaryAttr( + self._xdsl_attrs_from_tensortype(cast(XTCTensorType, tensor_type)) + ) + for tensor_type in [*inputs_types, *outputs_types] + ] + ) inlined_block = Block(arg_types=params_types) variables = { name: arg @@ -109,11 +124,11 @@ def _init_from_graph( with ImplicitBuilder(inlined_block): func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy - payload = xdslFuncOp.from_region( + payload = xdslFuncOp( name=graph.name, - input_types=params_types, - return_types=[], + function_type=(params_types, []), region=region, + arg_attrs=arg_attrs, ) nodes_dict = {} for attrs in block_attrs: @@ -139,6 +154,11 @@ def _xdsl_type_from_tensortype(self, type: XTCTensorType) 
-> Any: elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) return MemRefType(elt_type, shape) + def _xdsl_attrs_from_tensortype(self, type: XTCTensorType): + if type.device is not None: + return {"memref.on_device": UnitAttr()} + return {} + def _np_types_spec( self, types: list[MemRefType] ) -> list[dict[str, tuple[int, ...] | str]]: diff --git a/src/xtc/backends/mlir/MlirProgram.py b/src/xtc/backends/mlir/MlirProgram.py index ec9e68b1b..a9a77c179 100644 --- a/src/xtc/backends/mlir/MlirProgram.py +++ b/src/xtc/backends/mlir/MlirProgram.py @@ -73,23 +73,26 @@ def parse_and_add_function( function, context=self.mlir_context ) - # Insert (or not) the noalias attributes - arg_attrs = [] - if no_alias: - for _ in payload_func.arguments: - dict_attr = DictAttr.get( - { - "llvm.noalias": UnitAttr.get(context=self.mlir_context), - }, - context=self.mlir_context, + with self.mlir_context: + # Insert (or not) the noalias attributes + new_arg_attrs = [] + if no_alias: + for arg_attrs in payload_func.arg_attrs: + new_dict = {} + for i in range(len(arg_attrs)): + new_dict[arg_attrs[i].name] = arg_attrs[i].attr + new_dict["llvm.noalias"] = UnitAttr.get(context=self.mlir_context) + new_arg_attrs.append( + DictAttr.get(new_dict, context=self.mlir_context) + ) + payload_func.arg_attrs = ArrayAttr.get( + new_arg_attrs, context=self.mlir_context ) - arg_attrs.append(dict_attr) - payload_func.arg_attrs = ArrayAttr.get(arg_attrs, context=self.mlir_context) - # Insert the function in the MLIR program - ip = InsertionPoint.at_block_begin(self.mlir_module.body) - ip.insert(payload_func) - name = str(payload_func.name).replace('"', "") - self.local_functions[str(name)] = payload_func + # Insert the function in the MLIR program + ip = InsertionPoint.at_block_begin(self.mlir_module.body) + ip.insert(payload_func) + name = str(payload_func.name).replace('"', "") + self.local_functions[str(name)] = payload_func return payload_func diff --git 
a/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py index 28d7f3dcc..b14174c88 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py @@ -36,6 +36,7 @@ from ..MlirProgram import RawMlirProgram from mlir.passmanager import PassManager +from mlir.ir import OpResult __all__ = ["MlirCTarget"] @@ -181,6 +182,14 @@ def create_module( ) -> itf.comp.Module: return HostModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + def dump_ir(self, mlir_program: RawMlirProgram, title: str): print(f"// -----// {title} //----- //", file=sys.stderr) print(str(mlir_program.mlir_module), file=sys.stderr) diff --git a/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py index 60be1fef0..22a660bd0 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py @@ -37,6 +37,7 @@ from ..MlirProgram import RawMlirProgram from mlir.passmanager import PassManager +from mlir.ir import OpResult __all__ = ["MlirLLVMTarget"] @@ -176,6 +177,14 @@ def create_module( ) -> itf.comp.Module: return HostModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + def dump_ir(self, mlir_program: RawMlirProgram, title: str): print(f"// -----// {title} //----- //", file=sys.stderr) print(str(mlir_program.mlir_module), file=sys.stderr) diff --git a/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py new file mode 100644 index 000000000..43cc5ed81 --- /dev/null +++ b/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py @@ -0,0 +1,461 
@@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing_extensions import override +from typing import Any +import subprocess +import os +import sys +import tempfile +from pathlib import Path + +from xtc.utils.ext_tools import ( + get_shlib_extension, + runtime_libs, + system_libs, + cc_bin, +) + +from xtc.runtimes.accelerator.mppa import MppaConfig + +from xtc.targets.accelerator.mppa import MppaModule +import xtc.itf as itf +from xtc.itf.graph import Graph + +from .MlirTarget import MlirTarget +from ..MlirConfig import MlirConfig +from ..MlirProgram import RawMlirProgram + +from mlir.passmanager import PassManager +from mlir.ir import OpResult +from mlir.dialects import transform + +__all__ = ["MlirMppaTarget"] + + +class MlirMppaTarget(MlirTarget): + """Kalray MPPA Target + + This target implements the lowering and code generation to C + for the Kalray MPPA architecture, using the Mlir-Mppa backend. + """ + + def __init__(self, config: MlirConfig): + super().__init__(config) + # config.required_extensions.append("sdist") + self._mlir_mppa_backend = MlirMppaBackend(config) + + @override + def name(self) -> str: + return "mppa" + + @override + def arch(self) -> str: + return "kv3-2" + + @override + def generate_code_for_target( + self, + mlir_program: RawMlirProgram, # Will be modified in place + **kwargs: Any, + ) -> None: + save_temp = self._save_temp + save_temps_dir = self._config.save_temps_dir + temp_dir = None + dump_file = kwargs.get("dump_file", None) + if dump_file is None: + temp_dir = tempfile.mkdtemp() + dump_file = f"{temp_dir}/payload" + if self._config.save_temps: + assert dump_file is not None, "TODO: save_temp requires dump_file" + dump_tmp_dir = Path(save_temps_dir) + os.makedirs(save_temps_dir, exist_ok=True) + else: + dump_tmp_dir = Path(dump_file).parent + dump_base = Path(dump_file).name + + dump_tmp_file = f"{dump_tmp_dir}/{dump_base}" + mlir_atrn_dump_file = 
f"{dump_base}.after_trn.mlir" + mlir_bmppa_dump_file = f"{dump_base}.before_mppa.mlir" + mlir_amppa_dump_file = f"{dump_base}.after_mppa.mlir" + c_host_dump_file = f"{dump_base}.host.c" + c_accelerator_dump_file = f"{dump_base}.accelerator.c" + obj_host_dump_file = f"{dump_base}.host.o" + obj_accelerator_dump_file = f"{dump_base}.accelerator.o" + so_dump_file = f"{dump_file}.{get_shlib_extension()}" + kvx_so_dump_file = f"{dump_file}.kvx.so" + + # Lower to MLIR with MPPA dialect + save_temp(mlir_atrn_dump_file, mlir_program.mlir_module) + self._mlir_to_mppa_pass(mlir_program) + + # Run MLIR MPPA backend + with open(mlir_bmppa_dump_file, "w") as outf: + outf.write(str(mlir_program.mlir_module)) + self._mlir_mppa_backend.run_lowering( + mlir_before_mppa_dump_file=mlir_bmppa_dump_file, + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + ) + if self._config.print_lowered_ir: + print(f"// -----// IR Dump After MPPA Opt //----- //", file=sys.stderr) + with open(mlir_amppa_dump_file, "r") as inf: + print(inf.read(), file=sys.stderr) + + # Generate C code for host and accelerator + self._mlir_mppa_backend.generate_c_host( + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + c_host_dump_file=c_host_dump_file, + ) + self._mlir_mppa_backend.generate_c_accelerator( + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + c_accelerator_dump_file=c_accelerator_dump_file, + ) + + # Compile C code for accelerator + self._mlir_mppa_backend.compile_c_accelerator( + c_accelerator_dump_file=c_accelerator_dump_file, + obj_accelerator_dump_file=obj_accelerator_dump_file, + ) + # Link KVX library + self._mlir_mppa_backend.link_kvx_library( + obj_accelerator_dump_file=obj_accelerator_dump_file, + kvx_so_dump_file=kvx_so_dump_file, + ) + + # Compile C code for host + self._mlir_mppa_backend.compile_c_host( + c_host_dump_file=c_host_dump_file, + obj_host_dump_file=obj_host_dump_file, + kvx_so_dump_file=kvx_so_dump_file, + ) + + # Link final shared library + 
self._mlir_mppa_backend.link_shared_library( + obj_host_dump_file=obj_host_dump_file, + obj_accelerator_dump_file=obj_accelerator_dump_file, + so_dump_file=so_dump_file, + ) + + # Remove intermediate files if needed + if not self._config.save_temps: + os.remove(mlir_bmppa_dump_file) + os.remove(mlir_amppa_dump_file) + os.remove(c_host_dump_file) + os.remove(c_accelerator_dump_file) + os.remove(obj_host_dump_file) + os.remove(obj_accelerator_dump_file) + + @override + def create_module( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + graph: Graph | None = None, + **kwargs: Any, + ) -> itf.comp.Module: + mppa_config = MppaConfig(self._config) + return MppaModule( + name, payload_name, file_name, file_type, mppa_config, graph, **kwargs + ) + + @override + def custom_vectorize(self) -> bool: + return True + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + transform.AnnotateOp(handle, "xtc.request_vectorization") + + def dump_ir(self, mlir_program: RawMlirProgram, title: str): + print(f"// -----// {title} //----- //", file=sys.stderr) + print(str(mlir_program.mlir_module), file=sys.stderr) + + def _mlir_to_mppa_pass(self, mlir_program: RawMlirProgram): + to_mppa_pass = MlirProgramToMlirMppaPass( + mlir_program=mlir_program, + ) + to_mppa_pass.run() + if self._config.print_lowered_ir: + self.dump_ir(mlir_program, "IR Dump After MLIR Opt") + + @property + def shared_libs(self): + return system_libs + [ + f"{self._config.mlir_install_dir}/lib/{lib}" for lib in runtime_libs + ] + + @property + def shared_path(self): + return [f"-Wl,-rpath,{self._config.mlir_install_dir}/lib/"] + + def _save_temp(self, fname: str, content: Any) -> None: + if not self._config.save_temps: + return + os.makedirs(self._config.save_temps_dir, exist_ok=True) + with open(f"{self._config.save_temps_dir}/{fname}", "w") as outf: + outf.write(str(content)) + + +class MlirProgramToMlirMppaPass: + def __init__( + self, + mlir_program: 
RawMlirProgram, + ) -> None: + self._mlir_program = mlir_program + + def _lowering_pipeline(self) -> list[str]: + pipeline = [ + "cse", + "sccp", + ] + if "sdist" in self._mlir_program.mlir_extensions: + pipeline += [ + "sdist-lower-distribution", + "cse", + "convert-sdist-to-mppa", + "cse", + "convert-sdist-utils-to-mppa", + "cse", + "canonicalize", + "cse", + ] + return pipeline + + def run(self) -> None: + self._mlir_program.mlir_context.allow_unregistered_dialects = True + pm = PassManager(context=self._mlir_program.mlir_context) + pm.enable_verifier(False) + for opt in self._lowering_pipeline(): + pm.add(opt) # type: ignore # no attribte add? + pm.run(self._mlir_program.mlir_module.operation) + self._mlir_program.mlir_context.allow_unregistered_dialects = False + + +class MlirMppaBackend: + def __init__(self, config: MlirConfig): + self._config = config + try: + import mlir_mppa + except ImportError: + raise ImportError( + "mlir_mppa is not installed but is required for MPPA target" + ) + try: + self._csw_path = os.environ["KALRAY_TOOLCHAIN_DIR"] + except KeyError: + raise KeyError( + "Please source the Kalray Accesscore Toolchain: https://www.kalrayinc.com/products/software/" + ) + self._mlir_mppa_path = mlir_mppa.__path__[0] + + @property + def cmd_mppa_opt(self): + return [f"{self._mlir_mppa_path}/bin/mppa-opt"] + + @property + def cmd_mppa_translate(self): + return [f"{self._mlir_mppa_path}/bin/mppa-translate"] + + @property + def cmd_kvx_cc(self): + return [f"{self._csw_path}/bin/kvx-cos-gcc"] + + @property + def cmd_host_cc(self): + return [cc_bin] + + def _execute_command( + self, + cmd: list[str], + input_pipe: str | None = None, + pipe_stdoutput: bool = True, + ) -> subprocess.CompletedProcess: + pretty_cmd = "| " if input_pipe else "" + pretty_cmd += " ".join(cmd) + if self._config.debug: + print(f"> exec: {pretty_cmd}", file=sys.stderr) + + if input_pipe and pipe_stdoutput: + result = subprocess.run( + cmd, input=input_pipe, stdout=subprocess.PIPE, 
text=True + ) + elif input_pipe and not pipe_stdoutput: + result = subprocess.run(cmd, input=input_pipe, text=True) + elif not input_pipe and pipe_stdoutput: + result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, text=True) + return result + + def _lowering_pipeline(self) -> str: + passes = [] + # TODO run these only if sdist is not present + # passes.append("func.func(mppa-launch{device=k300})") + # passes.append("func.func(kvxcluster-scf-forall-distribute{num-clusters=1})") + # passes.append("func.func(kvxcluster-launch)") + passes.append("canonicalize") + passes.append("func.func(mppa-load-weights)") + passes.append("func.func(mppa-copy-buffers)") + passes.append("canonicalize") + passes.append("func.func(kalray-lift-strided-memref-copy-to-linalg)") + passes.append("canonicalize") + passes.append("func.func(kvxcluster-lower-promoted-memory)") + passes.append( + "func.func(kvxcluster-optimize-dma-transfers{bundle=true pipeline=false})" + ) + passes.append("canonicalize") + passes.append("func.func(kvxcluster-basic-static-allocation)") + passes.append("canonicalize") + passes.append("func.func(kalray-remove-useless-initializations)") + passes.append("canonicalize") + passes.append("func.func(kvxpe-scf-forall-distribute{num-pes=1})") + passes.append("func.func(kvxpe-launch)") + passes.append( + "func.func(kvxuks-catch{request-attribute=xtc.request_vectorization})" + ) + passes.append("canonicalize") + passes.append("convert-linalg-to-loops") + passes.append("func.func(lower-affine)") + passes.append("func.func(expand-strided-metadata)") + passes.append("func.func(kvx-non-canonical-vectorize)") + passes.append("func.func(kvx-vectorize)") + passes.append("func.func(scf-forall-to-for)") + passes.append("convert-math-to-kvxisa") + passes.append("convert-math-to-libm") + passes.append("func.func(lower-affine)") + passes.append("cse") + # TODO Enable Mppa traces + ##if config.mppa_trace_enable: + ## 
passes.append("func.func(kalray-request-benchmarks{target-op=kvxcluster.launch})") + ## passes.append("kalray-apply-instrumentation{use-traces=" + str(config.mppa_trace_enable) + "}") + passes.append("func.func(kvxcluster-outline-kernels{specialize=true})") + passes.append("func.func(canonicalize)") + + new_passes = [] + for p in passes: + new_passes.append(p) + new_passes.append("cse") + # new_passes.append("canonicalize") # FIXME bug with kvxcluster.launch + + # No cse or canonicalize must run after + new_passes.append( + "func.func(kalray-clone-crossing-constants)" + ) # TODO remove remaining useless + passes = new_passes + + return "builtin.module(" + ",".join(passes) + ")" + + def run_lowering( + self, mlir_before_mppa_dump_file: str, mlir_after_mppa_dump_file: str + ) -> None: + cmd = self.cmd_mppa_opt + [ + "-pass-pipeline=" + self._lowering_pipeline(), + mlir_before_mppa_dump_file, + "-o", + mlir_after_mppa_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def generate_c_host( + self, mlir_after_mppa_dump_file: str, c_host_dump_file: str + ) -> None: + cmd = self.cmd_mppa_translate + [ + "--mlir-to-c-host", + mlir_after_mppa_dump_file, + "-o", + c_host_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def generate_c_accelerator( + self, mlir_after_mppa_dump_file: str, c_accelerator_dump_file: str + ) -> None: + cmd = self.cmd_mppa_translate + [ + "--mlir-to-c-accelerator", + mlir_after_mppa_dump_file, + "-o", + c_accelerator_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def compile_c_accelerator( + self, c_accelerator_dump_file: str, obj_accelerator_dump_file: str + ) -> None: + cmd = self.cmd_kvx_cc + [ + "-O2", + "-fPIC", + f"-I{self._mlir_mppa_path}/include", + "-march=kv3-2", + "-DBUILD_ID=0", + "-fvect-cost-model=cheap", + "-fstack-limit-register=sr", + "-c", + c_accelerator_dump_file, + 
"-o", + obj_accelerator_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def link_kvx_library( + self, obj_accelerator_dump_file: str, kvx_so_dump_file: str + ) -> None: + cmd = self.cmd_kvx_cc + [ + "-shared", + "-fPIC", + "-march=kv3-2", + "-Wl,-soname=libkvx.so", + obj_accelerator_dump_file, + "-o", + kvx_so_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def compile_c_host( + self, c_host_dump_file: str, obj_host_dump_file: str, kvx_so_dump_file: str + ) -> None: + cmd = self.cmd_host_cc + [ + "-O2", + "-fPIC", + "-Wall", + "-Wextra", + "-I" + self._mlir_mppa_path + "/include", + "-I" + self._csw_path + "/include", + "-DTARGET_KV3_2", + '-DKERNEL_PATHNAME="' + kvx_so_dump_file + '"', + "-c", + c_host_dump_file, + "-o", + obj_host_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def link_shared_library( + self, obj_host_dump_file: str, obj_accelerator_dump_file: str, so_dump_file: str + ) -> None: + cmd = self.cmd_host_cc + [ + "-shared", + "-fPIC", + "-O2", + obj_host_dump_file, + "-o", + so_dump_file, + "-Wl,-rpath,$ORIGIN/../lib", + "-L" + self._csw_path + "/lib", + "-lmppa_offload_host", + "-lmopd", + "-lmppa_rproc_host", + "-lpthread", + "-L" + self._mlir_mppa_path + "/_mlir_libs", + "-lmlir_c_runner_utils", + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 diff --git a/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py index b5cbef330..ba4ba5a2e 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py @@ -23,12 +23,12 @@ cc_bin, ) from xtc.utils.tools import get_cuda_prefix -from xtc.targets.gpu import GPUModule +from xtc.targets.accelerator.gpu import GPUModule import xtc.itf as itf from xtc.itf.graph import Graph from mlir.dialects import 
func -from mlir.ir import UnitAttr +from mlir.ir import UnitAttr, OpResult from mlir.passmanager import PassManager from .MlirTarget import MlirTarget @@ -323,6 +323,14 @@ def create_module( ) -> itf.comp.Module: return GPUModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + @property def disassemble_option(self): if not self._config.to_disassemble: diff --git a/src/xtc/backends/mlir/MlirTarget/MlirTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirTarget.py index 886c49a1f..02a0f5eba 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirTarget.py @@ -11,6 +11,8 @@ import xtc.itf as itf from xtc.itf.graph import Graph +from mlir.ir import OpResult + __all__ = ["MlirTarget"] @@ -65,3 +67,17 @@ def create_module( for the target. """ ... + + @abstractmethod + def custom_vectorize(self) -> bool: + """ + Return True if the target needs to apply custom vectorization. + """ + ... + + @abstractmethod + def apply_custom_vectorize(self, handle: OpResult) -> None: + """ + Apply the custom vectorization for the target. + """ + ... 
diff --git a/src/xtc/backends/mlir/MlirTarget/__init__.py b/src/xtc/backends/mlir/MlirTarget/__init__.py index f61f178dd..0ccd44d7b 100644 --- a/src/xtc/backends/mlir/MlirTarget/__init__.py +++ b/src/xtc/backends/mlir/MlirTarget/__init__.py @@ -20,6 +20,10 @@ def get_target_from_name(name: str) -> type[MlirTarget]: from .MlirNVGPUTarget import MlirNVGPUTarget return MlirNVGPUTarget + elif name == "mppa": + from .MlirMppaTarget import MlirMppaTarget + + return MlirMppaTarget else: raise NameError(f"'{name}' is not a known target") diff --git a/src/xtc/csrcs/runtimes/gpu/perf_event_gpu.cpp b/src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.cpp similarity index 100% rename from src/xtc/csrcs/runtimes/gpu/perf_event_gpu.cpp rename to src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.cpp diff --git a/src/xtc/csrcs/runtimes/gpu/perf_event_gpu.h b/src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.h similarity index 100% rename from src/xtc/csrcs/runtimes/gpu/perf_event_gpu.h rename to src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.h diff --git a/src/xtc/csrcs/runtimes/accelerator/mppa/host.c b/src/xtc/csrcs/runtimes/accelerator/mppa/host.c new file mode 100644 index 000000000..1962bf81c --- /dev/null +++ b/src/xtc/csrcs/runtimes/accelerator/mppa/host.c @@ -0,0 +1,130 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2024-2026 The XTC Project Authors + */ +#include "host_structures.h" +#include "mlir_host_header.h" +#include "mppa_management_host.h" + +#include + +#include +#include +#include +#include + +void *mppa_common_structures = NULL; +mppa_offload_accelerator_t *mppa_accelerator = NULL; +mppa_offload_sysqueue_t *master_sysqueue = NULL; +size_t mppa_alloc_alignment = 64; + +bool mppa_init_device() +{ + if (mppa_common_structures == NULL) { + mppa_common_structures = mppa_pre_init(); + // Get ctx + mppa_offload_ctx_ptr = (mppa_offload_host_context_t*) ((void**)mppa_common_structures)[0]; + assert(mppa_offload_ctx_ptr != NULL); + 
// Get accelerator + mppa_accelerator = mppa_offload_get_accelerator(mppa_offload_ctx_ptr, 0); + assert(mppa_accelerator != NULL); + // Get main sysqueue + master_sysqueue = mppa_offload_get_sysqueue(mppa_accelerator, 0); + assert(master_sysqueue != NULL); + } + return true; +} + +bool mppa_deinit_device() +{ + mppa_de_init(); + mppa_common_structures = NULL; + return true; +} + +void* mppa_get_common_structures() +{ + return mppa_common_structures; +} + +void* mppa_create_memory_handle() +{ + void *handle = malloc(sizeof(mppa_buffer_t)); + assert(handle != NULL); + return handle; +} + +bool mppa_destroy_memory_handle(void *handle) +{ + free(handle); + return true; +} + +void mppa_set_alloc_alignment(size_t alignment) +{ + mppa_alloc_alignment = alignment; +} + +bool mppa_memory_allocate(void *handle, size_t size) +{ + assert(handle != NULL); + assert(master_sysqueue != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_alloc(master_sysqueue, size, mppa_alloc_alignment, MPPA_OFFLOAD_ALLOC_DDR, &(buffer->voffset), &(buffer->offset)) != 0) { + assert(0 && "Fail to alloc buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_free(void *handle) +{ + assert(handle != NULL); + assert(master_sysqueue != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_free(master_sysqueue, MPPA_OFFLOAD_ALLOC_DDR, buffer->voffset) != 0) { + assert(0 && "Fail to dealloc buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_copy_to(void *handle, void *src, size_t size) +{ + assert(handle != NULL); + assert(mppa_accelerator != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_write(mppa_accelerator, src, buffer->offset, size, NULL) != 0) { + assert(0 && "Failed write buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_copy_from(void *handle, void *dst, size_t size) +{ + assert(handle != NULL); + assert(mppa_accelerator != NULL); + mppa_buffer_t *buffer = 
(mppa_buffer_t *)handle; + if (mppa_offload_read(mppa_accelerator, dst, buffer->offset, size, NULL) != 0) { + assert(0 && "Failed read buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_fill_zero(void *handle, size_t size) +{ + void* tmp = calloc(size, 1); + assert(tmp != NULL); + bool res = mppa_memory_copy_to(handle, tmp, size); + free(tmp); + return res; +} + +void* mppa_memory_data_pointer(void *handle) +{ + assert(handle != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + return (void*)buffer->voffset; +} diff --git a/src/xtc/graphs/xtc/data.py b/src/xtc/graphs/xtc/data.py index 2451de8a9..f26c35168 100644 --- a/src/xtc/graphs/xtc/data.py +++ b/src/xtc/graphs/xtc/data.py @@ -19,6 +19,8 @@ ConstantDataType, ) +from xtc.itf.runtime.accelerator import AcceleratorDevice + __all__ = [ "XTCTensorType", @@ -27,9 +29,15 @@ class XTCTensorType(TensorType): - def __init__(self, shape: ShapeType = None, dtype: DataType = None): + def __init__( + self, + shape: ShapeType = None, + dtype: DataType = None, + device: AcceleratorDevice | None = None, + ): self._shape = shape self._dtype = dtype + self._device = device @property @override @@ -41,6 +49,11 @@ def shape(self) -> ShapeType: def dtype(self) -> DataType: return self._dtype + @property + @override + def device(self) -> AcceleratorDevice | None: + return self._device + @property @override def ndim(self) -> int: diff --git a/src/xtc/graphs/xtc/expr.py b/src/xtc/graphs/xtc/expr.py index e4ce0accd..c76a861ef 100644 --- a/src/xtc/graphs/xtc/expr.py +++ b/src/xtc/graphs/xtc/expr.py @@ -24,6 +24,8 @@ XTCOperTranspose, ) +from xtc.itf.runtime.accelerator import AcceleratorDevice + __all__ = [ "XTCExpr", "XTCValueExpr", @@ -116,12 +118,13 @@ def __init__( tensor: XTCTensorType | XTCTensor | ShapeType | None = None, shape: ShapeType | DataType = None, dtype: DataType = None, + device: AcceleratorDevice | None = None, ) -> None: super().__init__() if tensor is None: assert shape is None or 
isinstance(shape, tuple) assert dtype is None or isinstance(dtype, str) - type = XTCTensorType(shape=shape, dtype=dtype) + type = XTCTensorType(shape=shape, dtype=dtype, device=device) value = XTCTensor(type=type) elif isinstance(tensor, XTCTensorType): assert shape is None and dtype is None @@ -133,11 +136,12 @@ def __init__( if shape is not None: assert isinstance(shape, str) assert dtype is None - type = XTCTensorType(shape=tensor, dtype=shape) + type = XTCTensorType(shape=tensor, dtype=shape, device=device) else: - type = XTCTensorType(shape=tensor, dtype=dtype) + type = XTCTensorType(shape=tensor, dtype=dtype, device=device) value = XTCTensor(type=type) self._value = value + self._device = device self._op = XTCOperTensor() @property diff --git a/src/xtc/graphs/xtc/operators.py b/src/xtc/graphs/xtc/operators.py index 1f65bd17e..a8addc0eb 100644 --- a/src/xtc/graphs/xtc/operators.py +++ b/src/xtc/graphs/xtc/operators.py @@ -12,6 +12,7 @@ from xtc.itf.operator import Operator from xtc.itf.data import Tensor, TensorType +from xtc.itf.runtime.accelerator import AcceleratorDevice from .data import XTCTensor, XTCTensorType from .operation import XTCOperation @@ -30,8 +31,14 @@ class XTCOperator(Operator): - def __init__(self, name: str, **attrs: XTCOperatorAttr) -> None: + def __init__( + self, + name: str, + device: AcceleratorDevice | None = None, + **attrs: XTCOperatorAttr, + ) -> None: self._name = name + self._device = device self._attrs = NS(**attrs) @property @@ -43,11 +50,27 @@ def name(self) -> str: def attrs(self) -> XTCOperatorAttrs: return self._attrs + @property + def device(self) -> AcceleratorDevice | None: + return self._device + @override def forward_types( self, inputs_types: Sequence[TensorType] ) -> Sequence[XTCTensorType]: - return [cast(XTCTensorType, inp_type) for inp_type in inputs_types] + if self.device is None: + return [cast(XTCTensorType, inp_type) for inp_type in inputs_types] + res_types = [] + for inp_type in inputs_types: + 
inp_tensor_type = cast(XTCTensorType, inp_type) + res_types.append( + XTCTensorType( + shape=inp_tensor_type.shape, + dtype=inp_tensor_type.dtype, + device=inp_tensor_type.device, + ) + ) + return res_types @override def forward(self, inputs: Sequence[Tensor]) -> Sequence[XTCTensor]: @@ -88,8 +111,8 @@ def _get_operation( class XTCOperTensor(XTCOperator): - def __init__(self) -> None: - super().__init__("tensor") + def __init__(self, **attrs: XTCOperatorAttr) -> None: + super().__init__("tensor", **attrs) @override def get_operation( @@ -112,8 +135,8 @@ def get_operation( class XTCOperMatmul(XTCOperator): - def __init__(self) -> None: - super().__init__("matmul") + def __init__(self, **attrs: XTCOperatorAttr) -> None: + super().__init__("matmul", **attrs) @override def get_operation( @@ -155,8 +178,7 @@ def forward_types( ) return [ XTCTensorType( - shape=(i, j), - dtype=inputs_types[0].dtype, + shape=(i, j), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -268,6 +290,7 @@ def forward_types( XTCTensorType( shape=tuple([*inputs_types[0].shape[:-3], oh, ow, f]), dtype=inputs_types[0].dtype, + device=self.device, ), ] @@ -299,8 +322,10 @@ class _OperPadImpl: def __init__(self, **attrs: XTCOperatorAttr) -> None: padding = attrs.get("padding", 0) constant_value = attrs.get("constant_value", 0) + device = attrs.get("device", None) self.padding = padding self.constant_value = constant_value + self.device = device def get_operation_variable( self, @@ -352,8 +377,7 @@ def forward_types( dims_types = [value + pad for value in dims_types] return [ XTCTensorType( - shape=tuple(dims_types), - dtype=inputs_types[0].dtype, + shape=tuple(dims_types), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -410,8 +434,15 @@ def __init__(self, **attrs: XTCOperatorAttr) -> None: ) if isinstance(padding, dict): padding = {k: v for k, v in padding.items() if v != (0, 0)} - self.impl = _OperPadImpl(padding=padding, constant_value=constant_value) - super().__init__("pad", 
padding=padding, constant_value=constant_value) + super().__init__( + "pad", + padding=padding, + constant_value=constant_value, + device=attrs.get("device", None), + ) + self.impl = _OperPadImpl( + padding=padding, constant_value=constant_value, device=self.device + ) @override def get_operation( @@ -486,8 +517,15 @@ def __init__(self, **attrs: XTCOperatorAttr) -> None: assert isinstance(constant_value, (int, float)), ( f"constant_value need to be a number" ) - self.impl = _OperPadImpl(padding=padding, constant_value=constant_value) - super().__init__("pad2d", padding=padding, constant_value=constant_value) + super().__init__( + "pad2d", + padding=padding, + constant_value=constant_value, + device=attrs.get("device", None), + ) + self.impl = _OperPadImpl( + padding=padding, constant_value=constant_value, device=self.device + ) @override def get_operation( @@ -602,8 +640,7 @@ def forward_types( dims_types = [value - pad for value in dims_types] return [ XTCTensorType( - shape=tuple(dims_types), - dtype=inputs_types[0].dtype, + shape=tuple(dims_types), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -668,8 +705,7 @@ def forward_types( out_shape = tuple([x if x != -1 else size // fixed_size for x in self._shape]) return [ XTCTensorType( - shape=out_shape, - dtype=inputs_types[0].dtype, + shape=out_shape, dtype=inputs_types[0].dtype, device=self.device ), ] @@ -701,8 +737,7 @@ def forward_types( out_shape = tuple([shape[n] for n in self.attrs.axes]) return [ XTCTensorType( - shape=out_shape, - dtype=inputs_types[0].dtype, + shape=out_shape, dtype=inputs_types[0].dtype, device=self.device ), ] diff --git a/src/xtc/itf/data/tensor.py b/src/xtc/itf/data/tensor.py index 9695a0fd8..823864142 100644 --- a/src/xtc/itf/data/tensor.py +++ b/src/xtc/itf/data/tensor.py @@ -7,6 +7,7 @@ from typing_extensions import override import numpy.typing +from xtc.itf.runtime.accelerator import AcceleratorDevice ShapeType: TypeAlias = tuple[int | str | None, ...] 
| None DataType: TypeAlias = str | None @@ -43,6 +44,16 @@ def dtype(self) -> DataType: """ ... + @property + @abstractmethod + def device(self) -> AcceleratorDevice | None: + """Returns the device of the tensor. + + Returns: + The device of the tensor + """ + ... + @property @abstractmethod def ndim(self) -> int: diff --git a/src/xtc/itf/runtime/accelerator.py b/src/xtc/itf/runtime/accelerator.py new file mode 100644 index 000000000..16fddf0d6 --- /dev/null +++ b/src/xtc/itf/runtime/accelerator.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod +from typing import Any, Callable +import ctypes + +from xtc.itf.comp.module import Module +from xtc.itf.runtime.common import CommonRuntimeInterface + + +class AcceleratorDevice(CommonRuntimeInterface, ABC): + """Abstract interface for an accelerator device (such as GPU, MPPA, etc).""" + + @abstractmethod + def detect_accelerator(self) -> bool: + """Detect if the accelerator device is available. + + Returns: + A boolean representing if the accelerator device is available. + """ + ... + + @abstractmethod + def init_device(self) -> None: + """Initialize the accelerator device. + + This method is called to initialize the accelerator device. + """ + ... + + @abstractmethod + def deinit_device(self) -> None: + """Deinitialize the accelerator device. + + This method is called to deinitialize the accelerator device. + """ + ... + + @abstractmethod + def load_module(self, module: Module) -> None: + """Load a module on the accelerator device. + + Args: + module (AcceleratorModule): The module to load. + """ + ... + + @abstractmethod + def get_module_function(self, module: Module, function_name: str) -> Callable: + """Get a function from a module on the accelerator device. + + Args: + module (AcceleratorModule): The module to get the function from. + function_name (str): The name of the function to get. + """ + ... 
+ + @abstractmethod + def unload_module(self, module: Module) -> None: + """Unload a module from the accelerator device. + + Args: + module (AcceleratorModule): The module to unload. + """ + ... + + @abstractmethod + def memory_allocate(self, size_bytes: int) -> Any: + """Allocate memory on the accelerator device. + + Args: + size_bytes (int): The size in bytes to allocate. + + Returns: + A handle or reference to the allocated memory. + """ + ... + + @abstractmethod + def memory_free(self, handle: Any) -> None: + """Free memory on the accelerator device. + + Args: + handle (Any): The handle to the memory to free. + """ + ... + + @abstractmethod + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + """Copy memory from the host to the accelerator device. + + Args: + acc_handle (Any): The handle to the memory to copy to. + src (ctypes.c_void_p): The source data pointer. + size_bytes (int): The size in bytes to copy. + """ + ... + + @abstractmethod + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + """Copy memory from the accelerator device to the host. + + Args: + acc_handle (Any): The handle to the memory to copy from. + dst (ctypes.c_void_p): The destination data pointer. + size_bytes (int): The size in bytes to copy. + """ + ... + + @abstractmethod + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + """Fill memory on the accelerator device with zeros. + + Args: + acc_handle (Any): The handle to the memory to fill with zeros. + size_bytes (int): The size in bytes to fill with zeros. + """ + ... + + @abstractmethod + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + """Get the data pointer of the memory on the accelerator device. + + Args: + acc_handle (Any): The handle to the memory to get the data pointer of. + """ + ... 
+ + # TODO: describe hardware architecture + # TODO: profiling and traces diff --git a/src/xtc/itf/runtime/common.py b/src/xtc/itf/runtime/common.py new file mode 100644 index 000000000..dd8a60d6e --- /dev/null +++ b/src/xtc/itf/runtime/common.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod +from typing import Any + +from xtc.runtimes.types.dlpack import DLDataType, DLDevice +from xtc.utils.cfunc import CFunc + + +class CommonRuntimeInterface(ABC): + """Abstract interface for a common runtime interface.""" + + @abstractmethod + def target_name(self) -> str: + """Get the name of the target. + + Returns: + A string representing the name of the target. + """ + ... + + @abstractmethod + def device_name(self) -> str: + """Get the name of the device. + + Returns: + A string representing the name of the device. + """ + ... + + @abstractmethod + def device_arch(self) -> str: + """Get the architecture of the device. + + Returns: + A string representing the architecture of the device. + """ + ... + + @abstractmethod + def device_id(self) -> int: + """Get the ID of the device. + + Returns: + An integer representing the ID of the device. + """ + ... + + @abstractmethod + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + """Evaluate a function with timing measurements. + + Args: + results: Pointer to array of doubles to store timing results. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + nargs: Number of arguments passed to the function. + cfunc: Function pointer to evaluate. + args: Pointer to array of void pointers containing function arguments. + """ + ... 
+ + @abstractmethod + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + """Evaluate a function with performance counter measurements. + + Args: + pmu_events: List of performance events to measure. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Function pointer to evaluate. + args_tuples: List of argument tuples. + """ + ... + + @abstractmethod + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + """Evaluate a packed function with timing measurements. + + Args: + results: Pointer to array of doubles to store timing results. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Packed function pointer to evaluate. + args: Pointer to array of packed arguments. + codes: Pointer to array of integers containing argument type codes. + nargs: Number of arguments. + """ + ... + + @abstractmethod + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + """Evaluate a packed function with performance counter measurements. + + Args: + results: Pointer to array of doubles to store performance results. + pmu_events: List of performance events to measure. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Packed function pointer to evaluate. + args: Pointer to array of packed arguments. 
+ codes: Pointer to array of integers containing argument type codes. + nargs: Number of arguments. + """ + ... + + @abstractmethod + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + """Create a new CNDArray. + + Args: + ndim: Number of dimensions. + shape: Pointer to array of int64 containing shape dimensions. + dtype: Data type descriptor. + device: Device descriptor. + + Returns: + Pointer to the created CNDArray, or None on failure. + """ + ... + + @abstractmethod + def cndarray_del(self, handle: Any) -> None: + """Delete a CNDArray. + + Args: + handle: Pointer to the CNDArray to delete. + """ + ... + + @abstractmethod + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + """Copy data from a data handle into a CNDArray. + + Args: + handle: Pointer to the destination CNDArray. + data_handle: Pointer to the source data. + """ + ... + + @abstractmethod + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + """Copy data from a CNDArray to a data handle. + + Args: + handle: Pointer to the source CNDArray. + data_handle: Pointer to the destination data. + """ + ... + + @abstractmethod + def evaluate_flops(self, dtype_name: str | bytes) -> float: + """Evaluate the peak floating-point operations per second for a given data type. + + Args: + dtype_name: Data type name as string or bytes (e.g., "float32"). + + Returns: + Peak FLOPS as a double, or 0.0 if the data type is not supported. + """ + ... 
diff --git a/src/xtc/itf/runtime/embedded.py b/src/xtc/itf/runtime/embedded.py new file mode 100644 index 000000000..b97a9339e --- /dev/null +++ b/src/xtc/itf/runtime/embedded.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod + +from xtc.itf.runtime.common import CommonRuntimeInterface + + +class EmbeddedDevice(CommonRuntimeInterface, ABC): + """Abstract interface for an embedded device.""" + + @abstractmethod + def flash(self, image_path: str) -> None: + """Flash a binary image to the device. + + Args: + image_path (str): Path to the binary image to flash. + """ + ... + + # TODO diff --git a/src/xtc/runtimes/accelerator/gpu/GPUDevice.py b/src/xtc/runtimes/accelerator/gpu/GPUDevice.py new file mode 100644 index 000000000..5afaa7f07 --- /dev/null +++ b/src/xtc/runtimes/accelerator/gpu/GPUDevice.py @@ -0,0 +1,355 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import logging +import ctypes +import logging +import ctypes +from pathlib import Path +from typing import Any, Callable +from typing_extensions import override + +from xtc.itf.runtime.accelerator import AcceleratorDevice +from xtc.itf.comp.module import Module +from xtc.utils.cfunc import CFunc, _str_list_to_c, _c_ascii_str + +from ...host.runtime import resolve_runtime, RuntimeType, runtime_funcs + +__all__ = ["GPUDevice"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.loader import LibLoader +from xtc.utils.tools import get_mlir_prefix +from xtc.utils.ext_tools import cuda_runtime_lib + + +class GPUDevice(AcceleratorDevice): + """A class for GPU device""" + + # This is a singleton class; only one instance of GPUDevice will ever be created. 
+ _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "GPUDevice": + if cls._instance is None: + cls._instance = super(GPUDevice, cls).__new__(cls) + cls._instance.__init_once__(*args) + return cls._instance + + def __init__(self): + # TODO check installation of cuda + pass + + def __init_once__(self): + self._mlir_runtime_lib = LibLoader( + f"{get_mlir_prefix()}/lib/{cuda_runtime_lib}" + ) + self.loaded_kernels: dict[Module, LibLoader] = {} + create_stream_func_name = "mgpuStreamCreate" + create_stream_func = getattr( + self._mlir_runtime_lib.lib, create_stream_func_name + ) + assert create_stream_func is not None, ( + f"Cannot find symbol {create_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + create_stream_func.argtypes = [] + create_stream_func.restype = ctypes.c_voidp + self._custream = create_stream_func() + + def __get_runtime_func(self, name: str) -> Callable: + if name in runtime_funcs: + entries = resolve_runtime(RuntimeType.GPU) + assert entries is not None + return entries[name] + raise AttributeError(f"undefined runtime function: {name}") + + def __del__(self): + remaining_modules = list(self.loaded_kernels.keys()) + for module in remaining_modules: + self.unload_module(module) + self._mlir_runtime_lib.close() + self._instance = None + + @override + def detect_accelerator(self) -> bool: + raise NotImplementedError("GPUDevice.detect_accelerator is not implemented") + + @override + def target_name(self) -> str: + return "nvgpu" + + @override + def device_name(self) -> str: + return "nvgpu" + + @override + def device_arch(self) -> str: + return "cuda" + + @override + def device_id(self) -> int: + return 0 # TODO: Handle multiple GPUs + + @override + def init_device(self) -> None: + # Not necessary for now + pass + + @override + def deinit_device(self) -> None: + # Not necessary for now + pass + + @override + def load_module(self, module: Module) -> None: + libloader = LibLoader(str(Path(module.file_name).absolute())) + 
self.loaded_kernels[module] = libloader + + @override + def get_module_function(self, module: Module, function_name: str) -> Callable: + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + func = getattr(self.loaded_kernels[module].lib, function_name) + assert func is not None, ( + f"Cannot find symbol {function_name} in lib {module.file_name}" + ) + return func + + @override + def unload_module(self, module: Module) -> None: + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + self.loaded_kernels[module].close() + self.loaded_kernels.pop(module) + + @override + def memory_allocate(self, size_bytes: int) -> Any: + func_name = "mgpuMemAlloc" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ctypes.c_uint64, ctypes.c_voidp, ctypes.c_bool] + func.restype = ctypes.c_voidp + return func(size_bytes, self._custream, True) + + @override + def memory_free(self, handle: Any) -> None: + func_name = "mgpuMemFree" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ctypes.c_voidp, ctypes.c_voidp] + func.restype = None + func(handle, self._custream) + + @override + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + # Copy memory to accelerator device + func_name = "mgpuMemcpy" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ + ctypes.c_voidp, + ctypes.c_voidp, + ctypes.c_uint64, + ctypes.c_voidp, + ] + func.restype = None + func(acc_handle, src, size_bytes, self._custream) + # Synchronize stream + sync_stream_func_name = "mgpuStreamSynchronize" + sync_stream_func = 
getattr(self._mlir_runtime_lib.lib, sync_stream_func_name) + assert sync_stream_func is not None, ( + f"Cannot find symbol {sync_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + sync_stream_func.argtypes = [ctypes.c_voidp] + sync_stream_func.restype = None + sync_stream_func(self._custream) + + @override + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + # Copy memory from accelerator device to host + func_name = "mgpuMemcpy" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ + ctypes.c_voidp, + ctypes.c_voidp, + ctypes.c_uint64, + ctypes.c_voidp, + ] + func.restype = None + func(dst, acc_handle, size_bytes, self._custream) + # Synchronize stream + sync_stream_func_name = "mgpuStreamSynchronize" + sync_stream_func = getattr(self._mlir_runtime_lib.lib, sync_stream_func_name) + assert sync_stream_func is not None, ( + f"Cannot find symbol {sync_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + sync_stream_func.argtypes = [ctypes.c_voidp] + sync_stream_func.restype = None + sync_stream_func(self._custream) + + @override + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + raise NotImplementedError("memory_fill_zero is not implemented for GPU device") + + @override + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + return ctypes.cast(acc_handle, ctypes.c_void_p) + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + self.__get_runtime_func("evaluate")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(nargs), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ) + + @override + def evaluate_perf( + self, + 
pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + args_array = (ctypes.c_voidp * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + values_num = 1 + if len(pmu_events) > 0: + values_num = len(pmu_events) + # FIXME check if the PMU events are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + self.__get_runtime_func("evaluate_perf")( + ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), + ctypes.c_int(len(args_tuples)), + ) + return [float(x) for x in results_array] + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError("evaluate_packed is not implemented for GPU device") + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError( + "evaluate_packed_perf is not implemented for GPU device" + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + # Convert shape if it's a list/tuple to ctypes array + if isinstance(shape, (list, tuple)): + shape_array = (ctypes.c_int64 * len(shape))(*shape) + shape = ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)) + return self.__get_runtime_func("cndarray_new")( + ctypes.c_int32(ndim), + shape, + dtype, + device, + ) + + @override + def cndarray_del(self, handle: Any) -> None: + 
self.__get_runtime_func("cndarray_del")(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_from_data")(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_to_data")(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return float( + self.__get_runtime_func("evaluate_flops")( + _c_ascii_str.from_param(dtype_name) + ) + ) + + # Extra methods + def _register_buffer(self, handle: Any, size_bytes: int) -> None: + nb_bytes_c = ctypes.c_int64(size_bytes) + buffer_ptr = ctypes.cast(handle, ctypes.c_void_p) + func_name = "mgpuMemHostRegister" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func(buffer_ptr, nb_bytes_c) + + def _unregister_buffer(self, handle: Any) -> None: + buffer_ptr = ctypes.cast(handle, ctypes.c_void_p) + func_name = "mgpuMemHostUnregister" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func(buffer_ptr) diff --git a/src/xtc/runtimes/accelerator/gpu/__init__.py b/src/xtc/runtimes/accelerator/gpu/__init__.py new file mode 100644 index 000000000..adfff5caa --- /dev/null +++ b/src/xtc/runtimes/accelerator/gpu/__init__.py @@ -0,0 +1,7 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .GPUDevice import GPUDevice + +__all__ = ["GPUDevice"] diff --git a/src/xtc/runtimes/accelerator/mppa/MppaDevice.py b/src/xtc/runtimes/accelerator/mppa/MppaDevice.py new file mode 100644 index 000000000..bbb982935 --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/MppaDevice.py @@ -0,0 +1,576 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 
The XTC Project Authors +# +import ctypes +import subprocess +import logging +import os +import subprocess +import ctypes +import sys +from pathlib import Path +from typing import Any, Callable +from typing_extensions import override + +from xtc.itf.runtime.accelerator import AcceleratorDevice +from xtc.itf.comp.module import Module +from xtc.utils.cfunc import CFunc + +__all__ = ["MppaDevice"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.ext_tools import cc_bin + +from .config import MppaConfig +from xtc.utils.loader import LibLoader +from xtc.runtimes.host.HostRuntime import HostRuntime + +MAX_NB_LOADED_KERNELS = 10 + + +def _get_csrcs_dir(): + return Path(__file__).parents[3] / "csrcs" / "runtimes" / "accelerator" / "mppa" + + +def _execute_command( + cmd: list[str], + input_pipe: str | None = None, + pipe_stdoutput: bool = True, + debug: bool = False, +) -> subprocess.CompletedProcess: + pretty_cmd = "| " if input_pipe else "" + pretty_cmd += " ".join(cmd) + if debug: + print(f"> exec: {pretty_cmd}", file=sys.stderr) + + if input_pipe and pipe_stdoutput: + result = subprocess.run( + cmd, input=input_pipe, stdout=subprocess.PIPE, text=True + ) + elif input_pipe and not pipe_stdoutput: + result = subprocess.run(cmd, input=input_pipe, text=True) + elif not input_pipe and pipe_stdoutput: + result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, text=True) + return result + + +def _compile_kvx_object(device: "MppaDevice", src_file: str, obj_file: str): + cmd_kvx_cc = [f"{device._csw_path}/bin/kvx-cos-gcc"] + cmd = cmd_kvx_cc + [ + "-O2", + "-fPIC", + f"-I{device._mlir_mppa_path}/include", + "-march=kv3-2", + "-c", + src_file, + "-o", + obj_file, + ] + return _execute_command(cmd=cmd, debug=device.config.mlir_config.debug) + + +def _compile_host_object(device: "MppaDevice", src_file: 
str, obj_file: str): + cmd_host_cc = [cc_bin] + cmd = cmd_host_cc + [ + "-O2", + "-fPIC", + "-Wall", + "-Wextra", + f"-I{device._mlir_mppa_path}/include", + f"-I{device._csw_path}/include", + f"-I{_get_csrcs_dir()}", + "-DNB_CC=5", + "-DTARGET_KV3_2", + '-DKERNEL_PATHNAME="' + device.config.work_dir + "/mppa_runtime_acc.so" + '"', + "-c", + src_file, + "-o", + obj_file, + ] + return _execute_command(cmd=cmd, debug=device.config.mlir_config.debug) + + +def _compile_runtime_lib(device: "MppaDevice") -> LibLoader: + kvx_src_files = [ + device._mlir_mppa_path + "/src/runtime/mppa_management_accelerator.c", + ] + host_src_files = [ + device._mlir_mppa_path + "/src/runtime/mppa_management_host.c", + str(_get_csrcs_dir() / "host.c"), + ] + + # Compile KVX objects + kvx_obj_files = [ + f"{device.config.work_dir}/{Path(file).stem}.o" for file in kvx_src_files + ] + for src_file, obj_file in zip(kvx_src_files, kvx_obj_files): + _compile_kvx_object(device, src_file, obj_file) + # Link KVX objects + cmd_kvx_cc = [f"{device._csw_path}/bin/kvx-cos-gcc"] + cmd_kvx_link = cmd_kvx_cc + [ + "-shared", + "-fPIC", + "-march=kv3-2", + "-Wl,-soname=mppa_runtime_acc.so", + *kvx_obj_files, + "-o", + device.config.work_dir + "/mppa_runtime_acc.so", + ] + exe_process = _execute_command( + cmd=cmd_kvx_link, debug=device.config.mlir_config.debug + ) + assert exe_process.returncode == 0 + + # Compile host objects + host_obj_files = [ + f"{device.config.work_dir}/{Path(file).stem}.o" for file in host_src_files + ] + for src_file, obj_file in zip(host_src_files, host_obj_files): + _compile_host_object(device, src_file, obj_file) + # Link host objects + cmd_host_cc = [cc_bin] + cmd_host_link = cmd_host_cc + [ + "-shared", + "-fPIC", + "-O2", + *host_obj_files, + "-o", + device.config.work_dir + "/mppa_runtime_host.so", + "-Wl,-rpath,$ORIGIN/../lib", + "-L" + device._csw_path + "/lib", + "-lmppa_offload_host", + "-lmopd", + "-lmppa_rproc_host", + "-lpthread", + ] + exe_process = _execute_command( 
+ cmd=cmd_host_link, debug=device.config.mlir_config.debug + ) + assert exe_process.returncode == 0 + + return LibLoader(device.config.work_dir + "/mppa_runtime_host.so") + + +class MppaDevice(AcceleratorDevice): + """A class for Mppa device""" + + # This is a singleton class; only one instance of MppaDevice will ever be created. + _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "MppaDevice": + if cls._instance is None: + cls._instance = super(MppaDevice, cls).__new__(cls) + cls._instance.__init_once__(*args) + return cls._instance + + def __init__(self, config: MppaConfig | None = None): + try: + import mlir_mppa + except ImportError: + raise ImportError( + "mlir_mppa is not installed but is required for MPPA target" + ) + try: + self._csw_path = os.environ["KALRAY_TOOLCHAIN_DIR"] + except KeyError: + raise KeyError( + "Please source the Kalray Accesscore Toolchain: https://www.kalrayinc.com/products/software/" + ) + self._mlir_mppa_path = mlir_mppa.__path__[0] + if (config is not None) and (config != self.config): + raise ValueError( + "MppaDevice already initialized with a different configuration" + ) + + def __init_once__(self, config: MppaConfig | None = None): + if config is None: + config = MppaConfig() + self.config: MppaConfig = config + self.lib_loader: LibLoader | None = None + self.mppa_initialized: bool = False + self.loaded_kernels: dict[Module, LibLoader] = {} + self.calls_counter: int = 0 + self.need_rebuild: bool = False + + def __build_runtime_lib(self) -> LibLoader: + os.system("mkdir -p " + self.config.work_dir) + build_subdir = self.config.work_dir + "/mppa_management" + os.system("mkdir -p " + build_subdir) + if self.config.platform in ["iss", "qemu"]: + os.environ["OMP_MPPA_FIRMWARE_NAME"] = self.config.firmware + os.environ["MPPA_RPROC_PLATFORM_MODE"] = "sim" + os.environ["MPPA_RPROC_SIM_PATH"] = self.config.work_dir + "/mymppa" + if self.need_rebuild: + os.system("rm -r " + build_subdir + "/*") + self.need_rebuild = 
False + return _compile_runtime_lib(self) + + def _insert_mock_tracepoints(self): + assert self.lib_loader is not None + kernel_fn = getattr(self.lib_loader.lib, "mppa_insert_mock_tracepoints") + kernel_fn() + + def __del__(self): + if self.mppa_initialized: + self.deinit_device() + if self.lib_loader is not None: + self.lib_loader.close() + self.lib_loader = None + self._instance = None + + @override + def detect_accelerator(self) -> bool: + o = subprocess.run( + ["kvx-board-diag", "--list-board"], capture_output=True, text=True + ) + if "No Available board" in o.stdout: + return False + return True + + @override + def target_name(self) -> str: + return "mppa" + + @override + def device_name(self) -> str: + return "k300" + + @override + def device_arch(self) -> str: + return "kv3-2" + + @override + def device_id(self) -> int: + return 0 # TODO: Allow multiple mppa per machine (e.g. TC4) + + @override + def init_device(self) -> None: + """Pre-Init Mppa-Offload, which takes around 3 secondes""" + if self.mppa_initialized: + return + if self.lib_loader is None: + self.lib_loader = self.__build_runtime_lib() + assert self.lib_loader is not None + if self.config.verbose: + print("(Mppa Pre-Init)") + os.environ["MLIR_MPPA_FIRMWARE_NAME"] = self.config.firmware + # prepare qemu/iss + if self.config.platform in ["iss", "qemu"]: + os.system("mkdir -p " + self.config.work_dir + "/mymppa") + os.environ["OMP_MPPA_FIRMWARE_NAME"] = self.config.firmware + os.environ["MPPA_RPROC_PLATFORM_MODE"] = "sim" + os.environ["MPPA_RPROC_SIM_PATH"] = self.config.work_dir + "/mymppa" + if self.config.platform == "iss": + if self.config.verbose: + print("(Launching ISS)") + subprocess.Popen( + "kvx-cluster --disable-cache --march=" + + self.config.arch + + " --no-load-elf --sim-server=SOCKET --mmap --mppa-wdir=" + + self.config.work_dir + + "/mymppa", + shell=True, + ) + elif self.config.platform == "qemu": + if self.config.verbose: + print("(Launching Qemu)") + subprocess.Popen( + 
"kvx-qemu-offload-bridge --arch " + + self.config.arch + + " --work-dir " + + self.config.work_dir + + "/mymppa", + shell=True, + ) + # set env variables for traces + if self.config.mppa_trace_enable: + if self.config.verbose: + print("(Using Mppa traces)") + if self.config.platform in ["iss", "qemu"]: + print( + "[Warning: Mppa traces are enabled, ISS/Qemu cannot handle them]" + ) + if self.config.mppa_trace_use_syscall: + os.environ["MPPA_ENVP"] = ( + "MPPA_TRACE_ENABLE_META=1 MPPA_TRACE_USE_SYSCALL=1" + ) + if self.config.verbose: + print( + "[Warning: Mppa traces are enabled using syscalls, please consider using hardware acquisition if the overhead is too high]" + ) + else: + os.environ["MPPA_ENVP"] = "MPPA_TRACE_ENABLE_META=1" + preinit_fn = getattr(self.lib_loader.lib, "mppa_init_device") + preinit_fn.restype = ctypes.c_bool + if not preinit_fn(): + raise Exception("Failed to pre-init Mppa-Offload") + self.mppa_initialized = True + + @override + def deinit_device(self) -> None: + """De-Init Mppa-Offload""" + assert self.lib_loader is not None + if not self.mppa_initialized: + return + remaining_modules = list(self.loaded_kernels.keys()) + for module in remaining_modules: + self.unload_module(module) + if self.config.verbose: + print("(Mppa De-Init)") + deinit_fn = getattr(self.lib_loader.lib, "mppa_deinit_device") + deinit_fn.restype = ctypes.c_bool + if not deinit_fn(): + raise Exception("Failed to de-init Mppa-Offload") + self.mppa_initialized = False + + @override + def load_module(self, module: Module) -> None: + """Add a new loaded kernel in the cache""" + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + if len(self.loaded_kernels) == MAX_NB_LOADED_KERNELS: + if self.config.verbose: + print( + "(maximum number of loaded kernels exceeded, removing the last recently used)" + ) + assert False, "Maximum number of loaded kernels exceeded" + # FIXME + # Init if not already done + self.init_device() + # Add new kernel + 
if self.config.verbose: + print("(Loading kernel: " + module.name + ")") + libloader = LibLoader(str(Path(module.file_name).absolute())) + # Pass the context created during pre-init + get_mppa_common_structures_fn = getattr( + self.lib_loader.lib, "mppa_get_common_structures" + ) + get_mppa_common_structures_fn.restype = ctypes.c_void_p + mppa_common_structures = get_mppa_common_structures_fn() + set_mppa_common_structures_fn = getattr( + libloader.lib, "set_mppa_common_structures" + ) + set_mppa_common_structures_fn.argtypes = [ctypes.c_void_p] + set_mppa_common_structures_fn(mppa_common_structures) + # Load kernel + load_kernel_fn = getattr(libloader.lib, "load_kernel") + load_kernel_fn() + self.loaded_kernels[module] = libloader + + @override + def get_module_function(self, module: Module, function_name: str) -> Callable: + if not self.mppa_initialized: + self.init_device() + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + func = getattr(self.loaded_kernels[module].lib, function_name) + assert func is not None, ( + f"Cannot find symbol {function_name} in lib {module.file_name}" + ) + return func + + @override + def unload_module(self, module: Module) -> None: + """Remove a loaded kernel from the cache""" + if not self.mppa_initialized: + self.init_device() + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + self.loaded_kernels[module].close() + self.loaded_kernels.pop(module) + + @override + def memory_allocate(self, size_bytes: int) -> Any: + assert self.lib_loader is not None + # Create a memory handle + create_memory_handle_fn = getattr( + self.lib_loader.lib, "mppa_create_memory_handle" + ) + create_memory_handle_fn.restype = ctypes.c_void_p + memory_handle = create_memory_handle_fn() + # Allocate memory + allocate_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_allocate") + allocate_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + allocate_memory_fn.restype = 
ctypes.c_bool + if not allocate_memory_fn(memory_handle, size_bytes): + raise Exception("Failed to allocate memory") + return memory_handle + + @override + def memory_free(self, handle: Any) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Free memory + free_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_free") + free_memory_fn.argtypes = [ctypes.c_void_p] + free_memory_fn.restype = ctypes.c_bool + if not free_memory_fn(handle): + raise Exception("Failed to free memory") + # Destroy memory handle + destroy_memory_handle_fn = getattr( + self.lib_loader.lib, "mppa_destroy_memory_handle" + ) + destroy_memory_handle_fn.argtypes = [ctypes.c_void_p] + destroy_memory_handle_fn.restype = ctypes.c_bool + if not destroy_memory_handle_fn(handle): + raise Exception("Failed to destroy memory handle") + + @override + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Copy memory to accelerator device + copy_to_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_copy_to") + copy_to_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t] + copy_to_memory_fn.restype = ctypes.c_bool + if not copy_to_memory_fn(acc_handle, src, size_bytes): + raise Exception("Failed to copy memory to accelerator device") + + @override + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Copy memory from accelerator device to host + copy_from_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_copy_from") + copy_from_memory_fn.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + copy_from_memory_fn.restype = ctypes.c_bool + if not copy_from_memory_fn(acc_handle, dst, size_bytes): + raise Exception("Failed to copy memory from 
accelerator device to host") + + @override + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + fill_zero_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_fill_zero") + fill_zero_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + fill_zero_memory_fn.restype = ctypes.c_bool + if not fill_zero_memory_fn(acc_handle, size_bytes): + raise Exception("Failed to fill memory with zeros") + + @override + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Get data pointer + get_data_pointer_fn = getattr(self.lib_loader.lib, "mppa_memory_data_pointer") + get_data_pointer_fn.argtypes = [ctypes.c_void_p] + get_data_pointer_fn.restype = ctypes.c_void_p + return get_data_pointer_fn(acc_handle) + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + HostRuntime.get().evaluate( + results, + repeat, + number, + nargs, + cfunc, + args, + ) + + @override + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + return HostRuntime.get().evaluate_perf( + pmu_events, + repeat, + number, + min_repeat_ms, + cfunc, + args_tuples, + ) + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError("evaluate_packed is not implemented for MPPA device") + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError( + "evaluate_packed_perf is not 
implemented for MPPA device" + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + return HostRuntime.get().cndarray_new(ndim, shape, dtype, device) + + @override + def cndarray_del(self, handle: Any) -> None: + HostRuntime.get().cndarray_del(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + HostRuntime.get().cndarray_copy_from_data(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + HostRuntime.get().cndarray_copy_to_data(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return HostRuntime.get().evaluate_flops(dtype_name) diff --git a/src/xtc/runtimes/accelerator/mppa/__init__.py b/src/xtc/runtimes/accelerator/mppa/__init__.py new file mode 100644 index 000000000..4fdcb768f --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/__init__.py @@ -0,0 +1,8 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .config import MppaConfig +from .MppaDevice import MppaDevice + +__all__ = ["MppaConfig", "MppaDevice"] diff --git a/src/xtc/runtimes/accelerator/mppa/config.py b/src/xtc/runtimes/accelerator/mppa/config.py new file mode 100644 index 000000000..5e7b1a3c1 --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/config.py @@ -0,0 +1,161 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import os +from typing import Any +from typing_extensions import override + +from xtc.backends.mlir.MlirConfig import MlirConfig + +VALID_PLATFORMS = ["hw", "iss", "qemu"] +VALID_ARCHS = ["kv3-1", "kv3-2"] +VALID_FIRMWARES = ["ocl_fw_l1.elf"] # TODO add other firmwares + +DEFAULT_WORK_DIR = "/tmp/" + os.getlogin() + "/mlir_mppa" +DEFAULT_PLATFORM = "hw" +DEFAULT_ARCH = "kv3-2" +DEFAULT_FIRMWARE = "ocl_fw_l1.elf" +DEFAULT_VERBOSE = True 
+DEFAULT_BUILD_VERBOSE = 0 +DEFAULT_BENCHMARK = False +DEFAULT_MPPA_TRACE_ENABLE = False +DEFAULT_MPPA_TRACE_USE_SYSCALL = True + + +class MppaConfig: + """A class to gather all configs""" + + def __init__(self, mlir_config: MlirConfig | None = None): + if mlir_config is None: + mlir_config = MlirConfig() + # Default Configuration + self.work_dir: str = DEFAULT_WORK_DIR + self.platform: str = get_platform() + self.arch: str = DEFAULT_ARCH + self.firmware: str = DEFAULT_FIRMWARE + self.verbose: bool = mlir_config.debug + self.build_verbose: int = mlir_config.debug + self.benchmark: bool = DEFAULT_BENCHMARK + self.mppa_trace_enable: bool = DEFAULT_MPPA_TRACE_ENABLE + self.mppa_trace_use_syscall: bool = DEFAULT_MPPA_TRACE_USE_SYSCALL + self.mlir_config: MlirConfig = mlir_config + # Read from env + self.set_platform(get_platform()) + self.set_benchmark(is_benchmark()) + self.set_mppa_trace_enable(mppa_trace_enable()) + self.set_mppa_trace_use_syscall(mppa_trace_use_syscall()) + if os.getenv("CLEAN_WORK_DIR", "0") in ["1", "true", "True"]: + self.clean_work_dir() + + def set_work_dir(self, work_dir: str) -> None: + if self.work_dir != "" and self.work_dir != "/": + self.work_dir = work_dir + + def set_platform(self, platform: str) -> None: + if platform in VALID_PLATFORMS: + self.platform = platform + + def set_arch(self, arch: str) -> None: + if arch in VALID_ARCHS: + self.arch = arch + + def set_firmware(self, firmware: str) -> None: + if firmware in VALID_FIRMWARES: + self.firmware = firmware + + def set_verbose(self, verbose: bool) -> None: + self.verbose = verbose + + def set_build_verbose(self, build_verbose: int) -> None: + self.build_verbose = build_verbose + + def set_benchmark(self, benchmark: bool) -> None: + self.benchmark = benchmark + + def set_mppa_trace_enable(self, mppa_trace_enable: bool) -> None: + self.mppa_trace_enable = mppa_trace_enable + + def set_mppa_trace_use_syscall(self, mppa_trace_use_syscall: bool) -> None: + self.mppa_trace_use_syscall = 
mppa_trace_use_syscall + + def clean_work_dir(self): + if os.path.exists(self.work_dir): + os.system("rm -r " + self.work_dir) + + @override + def __str__(self) -> str: + s = "Mppa configuration:\n" + s += " - work_dir: " + self.work_dir + "\n" + s += " - platform: " + self.platform + "\n" + s += " - arch: " + self.arch + "\n" + s += " - firmware: " + self.firmware + "\n" + s += " - verbose: " + str(self.verbose) + "\n" + s += " - build_verbose: " + str(self.build_verbose) + "\n" + s += " - benchmark: " + str(self.benchmark) + s += " - mppa_trace_enable: " + str(self.mppa_trace_enable) + "\n" + s += " - mppa_trace_use_syscall: " + str(self.mppa_trace_use_syscall) + "\n" + return s + + @override + def __eq__(self, other: Any) -> bool: + if not isinstance(other, MppaConfig): + return False + return ( + self.work_dir == other.work_dir + and self.platform == other.platform + and self.arch == other.arch + and self.firmware == other.firmware + and self.verbose == other.verbose + and self.build_verbose == other.build_verbose + and self.benchmark == other.benchmark + and self.mppa_trace_enable == other.mppa_trace_enable + and self.mppa_trace_use_syscall == other.mppa_trace_use_syscall + ) + + +# Creation of a MppaConfig from env + + +def get_platform() -> str: + platform = os.getenv("PLATFORM") + if platform is not None: + if platform in VALID_PLATFORMS: + return platform + else: + print(f"\033[91mUnknown platform: {platform}\033[0m") + exit(1) + return DEFAULT_PLATFORM + + +def is_benchmark(): + if "BENCHMARK" in os.environ: + if os.getenv("BENCHMARK") in ["1", "true", "True"]: + return True + else: + return False + return DEFAULT_BENCHMARK + + +def mppa_trace_enable(): + if "MPPA_TRACE_ENABLE" in os.environ: + if os.getenv("MPPA_TRACE_ENABLE") in ["1", "true", "True"]: + return True + else: + return False + if is_benchmark(): + return True + return DEFAULT_MPPA_TRACE_ENABLE + + +def mppa_trace_use_syscall(): + if "MPPA_TRACE_USE_SYSCALL" in os.environ: + if 
os.getenv("MPPA_TRACE_USE_SYSCALL") in ["1", "true", "True"]: + return True + else: + return False + return DEFAULT_MPPA_TRACE_USE_SYSCALL + + +def mppa_trace_use_oculink(): + return not mppa_trace_use_syscall() diff --git a/src/xtc/runtimes/gpu/runtime.py b/src/xtc/runtimes/gpu/runtime.py deleted file mode 100644 index 92a09d069..000000000 --- a/src/xtc/runtimes/gpu/runtime.py +++ /dev/null @@ -1,26 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -import logging - -from xtc.runtimes.host.runtime import runtime_funcs, resolve_runtime, RuntimeType - -logger = logging.getLogger(__name__) - -# Can be set to True for RUNTIME_DEBUG -RUNTIME_DEBUG = False - -# GPU Runtime - - -def type() -> RuntimeType: - return RuntimeType.GPU - - -def __getattr__(x: str): - if x in runtime_funcs: - entries = resolve_runtime(RuntimeType.GPU) - assert entries is not None - return entries[x] - raise AttributeError(f"undefined runtime function: {x}") diff --git a/src/xtc/runtimes/host/HostRuntime.py b/src/xtc/runtimes/host/HostRuntime.py new file mode 100644 index 000000000..c4a6440d6 --- /dev/null +++ b/src/xtc/runtimes/host/HostRuntime.py @@ -0,0 +1,210 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import ctypes +import logging +from typing import Any, Callable +from typing_extensions import override + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.cfunc import CFunc, _str_list_to_c, _c_ascii_str +from xtc.itf.runtime.common import CommonRuntimeInterface + +from .runtime import runtime_funcs, resolve_runtime, RuntimeType + +__all__ = ["HostRuntime"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + + +class HostRuntime(CommonRuntimeInterface): + """A class for Host runtime""" + + # This is a singleton class; only one instance of HostRuntime will ever be created. 
+ _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "HostRuntime": + if cls._instance is None: + cls._instance = super(HostRuntime, cls).__new__(cls) + return cls._instance + + def __init__(self): + pass + + def __del__(self): + self._instance = None + + def __get_runtime_func(self, name: str) -> Callable: + if name in runtime_funcs: + entries = resolve_runtime(RuntimeType.HOST) + assert entries is not None + return entries[name] + raise AttributeError(f"undefined runtime function: {name}") + + def __getattr__(self, name: str) -> Callable: + return self.__get_runtime_func(name) + + @classmethod + def get(cls) -> "HostRuntime": + if cls._instance is None: + cls._instance = HostRuntime() + return cls._instance + + @override + def target_name(self) -> str: + return "host" + + @override + def device_name(self) -> str: + return "host" + + @override + def device_arch(self) -> str: + return "host" + + @override + def device_id(self) -> int: + return 0 + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + self.__get_runtime_func("evaluate")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(nargs), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ) + + @override + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + args_array = (ctypes.c_voidp * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + values_num = 1 + if len(pmu_events) > 0: + values_num = len(pmu_events) + # FIXME check if the PMU events are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + self.__get_runtime_func("evaluate_perf")( + ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), + 
ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), + ctypes.c_int(len(args_tuples)), + ) + return [float(x) for x in results_array] + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + self.__get_runtime_func("evaluate_packed")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ctypes.cast(codes, ctypes.POINTER(ctypes.c_int)), + ctypes.c_int(nargs), + ) + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + self.__get_runtime_func("evaluate_packed_perf")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ctypes.cast(codes, ctypes.POINTER(ctypes.c_int)), + ctypes.c_int(nargs), + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + # Convert shape if it's a list/tuple to ctypes array + if isinstance(shape, (list, tuple)): + shape_array = (ctypes.c_int64 * len(shape))(*shape) + shape = ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)) + return self.__get_runtime_func("cndarray_new")( + ctypes.c_int32(ndim), + shape, + dtype, + device, 
+ ) + + @override + def cndarray_del(self, handle: Any) -> None: + self.__get_runtime_func("cndarray_del")(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_from_data")(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_to_data")(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return float( + self.__get_runtime_func("evaluate_flops")( + _c_ascii_str.from_param(dtype_name) + ) + ) diff --git a/src/xtc/runtimes/host/__init__.py b/src/xtc/runtimes/host/__init__.py index 4a40722d1..ab0e1dd81 100644 --- a/src/xtc/runtimes/host/__init__.py +++ b/src/xtc/runtimes/host/__init__.py @@ -2,3 +2,6 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # +from .HostRuntime import HostRuntime + +__all__ = ["HostRuntime"] diff --git a/src/xtc/runtimes/host/evaluate.py b/src/xtc/runtimes/host/evaluate.py deleted file mode 100644 index c15e3f96a..000000000 --- a/src/xtc/runtimes/host/evaluate.py +++ /dev/null @@ -1,75 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from types import ModuleType -import numpy as np -import numpy.typing -from pathlib import Path - -from xtc.utils.loader import LibLoader -from xtc.runtimes.types.ndarray import NDArray - -from .evaluator import Executor, Evaluator - - -def load_and_evaluate( - runtime: ModuleType, - module_file: str, - module_name: str, - payload_name: str, - **kwargs: Any, -) -> tuple[list[float], int, str]: - bare_ptr = kwargs.get("bare_ptr", True) - dll = str(Path(module_file).absolute()) - sym = payload_name - parameters: tuple[list[NDArray], list[NDArray]] = kwargs.get("parameters", []) - ref_outputs: list[numpy.typing.NDArray] = kwargs.get("ref_outputs", []) - validate = 
kwargs.get("validate", False) - repeat = kwargs.get("repeat", 1) - number = kwargs.get("number", 1) - min_repeat_ms = kwargs.get("min_repeat_ms", 0) - pmu_counters = kwargs.get("pmu_counters", []) - with LibLoader(dll) as lib: - func = getattr(lib, sym) - assert func is not None, f"Cannot find symbol {sym} in lib {dll}" - func.packed = not bare_ptr - if validate: - exec_func = Executor(func) - exec_func(*parameters[0], *parameters[1]) - for out_ref, out in zip( - ref_outputs, [out.numpy() for out in parameters[1]] - ): - if not np.allclose(out_ref, out): - return [], 1, "Error in validation: outputs differ" - eval_func = Evaluator( - func, - runtime, - repeat=repeat, - min_repeat_ms=min_repeat_ms, - number=number, - pmu_counters=pmu_counters, - ) - results = eval_func(*parameters[0], *parameters[1]) - return results, 0, "" - - -def load_and_execute( - runtime: ModuleType, - module_file: str, - module_name: str, - payload_name: str, - **kwargs: Any, -) -> int: - _, code, _ = load_and_evaluate( - runtime, - module_file, - module_name, - payload_name, - repeat=1, - min_repeat_ms=0, - number=1, - **kwargs, - ) - return code diff --git a/src/xtc/runtimes/host/evaluator.py b/src/xtc/runtimes/host/evaluator.py deleted file mode 100644 index 4c49c760a..000000000 --- a/src/xtc/runtimes/host/evaluator.py +++ /dev/null @@ -1,180 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from types import ModuleType -import ctypes - -from xtc.runtimes.host.runtime import RuntimeType - -__all__ = [ - "Evaluator", - "Executor", -] - - -class ArgTypeCode: - INT = 0 - HANDLE = 3 - NDARRAY_HANDLE = 13 - - -CArgCode = ctypes.c_int - - -class CArgValue(ctypes.Union): - _fields_ = [ - ("v_int64", ctypes.c_int64), - ("v_float64", ctypes.c_double), - ("v_handle", ctypes.c_void_p), - ("v_str", ctypes.c_char_p), - ] - - -class CRetValue(CArgValue): - pass - - -CPackedFunc = ctypes.CFUNCTYPE( - ctypes.c_int, - 
ctypes.POINTER(CArgValue), - ctypes.POINTER(CArgCode), - ctypes.c_int, - ctypes.POINTER(CRetValue), - ctypes.POINTER(CArgCode), -) - - -class CFunc: - def __init__(self, f: Any, packed: bool = False) -> None: - self.handle = f - self.is_packed = packed or ( - hasattr(self.handle, "packed") and self.handle.packed - ) - - def arg_tuple(self, arg: Any) -> Any: - if arg.__class__.__name__ == "ndarray": # Numpy Array - assert not self.is_packed - return (arg.ctypes.data_as(ctypes.c_voidp), ArgTypeCode.HANDLE) - elif arg.__class__.__name__ == "NDArray": # TVM NDArray or our NDArray - if self.is_packed: - return ( - CArgValue(v_handle=ctypes.cast(arg.handle, ctypes.c_void_p)), - ArgTypeCode.NDARRAY_HANDLE, - ) - else: - return ( - ctypes.cast(arg.handle.contents.dl_tensor.data, ctypes.c_void_p), - ArgTypeCode.HANDLE, - ) - else: - assert 0, f"Unsupported argument class: {arg.__class__.__name__}" - - def args_tuples(self, args: Any) -> list[Any]: - return [self.arg_tuple(arg) for arg in args] - - def __call__(self, *args: Any): - args_tuples = self.args_tuples(args) - if self.is_packed: - args_array = (CArgValue * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - args_codes = (CArgCode * len(args_tuples))(*[arg[1] for arg in args_tuples]) - result_val = CRetValue(0) - result_code = CArgCode(ArgTypeCode.INT) - res = CPackedFunc(self.handle)( - args_array, - args_codes, - len(args_tuples), - ctypes.byref(result_val), - ctypes.byref(result_code), - ctypes.c_int(len(args_tuples)), - ) - assert res == 0, f"error calling packed function" - else: - data_args = [arg[0] for arg in args_tuples] - self.handle(*data_args) - - -class Evaluator: - def __init__( - self, - f: Any, - runtime: ModuleType, - repeat: int = 1, - number: int = 1, - min_repeat_ms: int = 0, - pmu_counters: list[str] = [], - ) -> None: - assert repeat > 0 - assert number > 0 - assert min_repeat_ms >= 0 - self.repeat = repeat - self.number = number - self.min_repeat_ms = min_repeat_ms - self.pmu_counters 
= pmu_counters - self.runtime = runtime - self.cfunc = CFunc(f) - - def _str_list_to_c(self, str_list: list[str]) -> Any: - return (ctypes.c_char_p * len(str_list))( - *[str.encode("utf-8") for str in str_list] - ) - - def __call__(self, *args: Any) -> list[float]: - args_tuples = self.cfunc.args_tuples(args) - values_num = 1 - if len(self.pmu_counters) > 0: - values_num = len(self.pmu_counters) - if ( - any(counter.startswith("gpu.") for counter in self.pmu_counters) - and self.runtime.type() != RuntimeType.GPU - ): - raise ValueError( - "GPU PMU counters are not requested but target is not a GPU." - ) - results_array = (ctypes.c_double * (self.repeat * values_num))() - if self.cfunc.is_packed: - args_array_packed = (CArgValue * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - args_codes_packed = (CArgCode * len(args_tuples))( - *[arg[1] for arg in args_tuples] - ) - self.runtime.evaluate_packed_perf( - ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), - ctypes.c_int(len(self.pmu_counters)), - self._str_list_to_c(self.pmu_counters), - ctypes.c_int(self.repeat), - ctypes.c_int(self.number), - ctypes.c_int(self.min_repeat_ms), - ctypes.cast(self.cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), - ctypes.cast(args_array_packed, ctypes.POINTER(ctypes.c_voidp)), - ctypes.cast(args_codes_packed, ctypes.POINTER(ctypes.c_int)), - ctypes.c_int(len(args_tuples)), - ) - else: - args_array = (ctypes.c_voidp * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - self.runtime.evaluate_perf( - ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), - ctypes.c_int(len(self.pmu_counters)), - self._str_list_to_c(self.pmu_counters), - ctypes.c_int(self.repeat), - ctypes.c_int(self.number), - ctypes.c_int(self.min_repeat_ms), - ctypes.cast(self.cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), - ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), - ctypes.c_int(len(args_tuples)), - ) - return [float(x) for x in results_array] - - -class Executor: 
- def __init__(self, f: Any) -> None: - self.func = CFunc(f) - - def __call__(self, *args: Any) -> None: - self.func(*args) diff --git a/src/xtc/runtimes/host/runtime.py b/src/xtc/runtimes/host/runtime.py index 17cf41db4..b1976cce6 100644 --- a/src/xtc/runtimes/host/runtime.py +++ b/src/xtc/runtimes/host/runtime.py @@ -175,7 +175,7 @@ def _compile_runtime(out_dll: str, tdir: str, runtime_type: RuntimeType): for i, file in enumerate(src_files): cmd = ( "cc -c -O2 -march=native -fPIC " - f"-I{src_dir} {debug_opts} {pfm_opts} {gpu_opts} -I{src_dir}/../gpu " + f"-I{src_dir} {debug_opts} {pfm_opts} {gpu_opts} -I{src_dir}/../accelerator/gpu " f"-o {obj_files[i]} {file}" ) logger.debug("Compiling runtime: %s", cmd) @@ -207,7 +207,7 @@ def _compile_runtime_gpu_extension(out_lib: str, tdir: str): "perf_event_gpu.cpp", ] top_dir = Path(__file__).parents[2] - src_dir = top_dir / "csrcs" / "runtimes" / "gpu" + src_dir = top_dir / "csrcs" / "runtimes" / "accelerator" / "gpu" src_files = [f"{src_dir}/{file}" for file in files] # Compile @@ -215,7 +215,7 @@ def _compile_runtime_gpu_extension(out_lib: str, tdir: str): obj_file = f"{tdir}/perf_event_gpu.o" cmd = ( "c++ -c -O2 -march=native -fPIC " - f"-I{src_dir} {debug_opts} -I{src_dir}/../host " + f"-I{src_dir} {debug_opts} -I{src_dir}/../../host " f"-I{cuda_install_dir}/include " f"-o {obj_file} {' '.join(src_files)}" ) @@ -274,18 +274,3 @@ def resolve_runtime(runtime_type: RuntimeType): ) _runtime_entries[runtime_type.value] = entries return _runtime_entries[runtime_type.value] - - -# Host Runtime - - -def type() -> RuntimeType: - return RuntimeType.HOST - - -def __getattr__(x: str): - if x in runtime_funcs: - entries = resolve_runtime(RuntimeType.HOST) - assert entries is not None - return entries[x] - raise AttributeError(f"undefined runtime function: {x}") diff --git a/src/xtc/runtimes/types/ndarray.py b/src/xtc/runtimes/types/ndarray.py index 0148d807c..351f4ecd1 100644 --- a/src/xtc/runtimes/types/ndarray.py +++ 
b/src/xtc/runtimes/types/ndarray.py @@ -5,6 +5,7 @@ from typing import Any import ctypes import numpy as np +from enum import Enum __all__ = [ "NDArray", @@ -12,7 +13,15 @@ from .dlpack import DLDevice, DLDeviceTypeCode, DLDataType, DLDataTypeCode, CNDArray -import xtc.runtimes.host.runtime as runtime +from xtc.runtimes.host.HostRuntime import HostRuntime + +from xtc.itf.runtime.common import CommonRuntimeInterface +from xtc.itf.runtime.accelerator import AcceleratorDevice + + +class NDArrayLocation(Enum): + HOST = 0 + DEVICE = 1 class NDArray: @@ -30,19 +39,28 @@ class NDArray: } rev_np_dtype_map: dict[tuple[int, int], str] = {} - def __init__(self, array: Any) -> None: + def __init__( + self, array: Any, runtime: CommonRuntimeInterface | None = None + ) -> None: if not self.rev_np_dtype_map: self.rev_np_dtype_map.update( {v: k for k, v in NDArray.np_dtype_map.items()} ) self.handle = None + self.device_handle = None + self.runtime = runtime + if self.runtime is None: + self.runtime = HostRuntime() + self.location = NDArrayLocation.HOST if isinstance(array, NDArray): raise RuntimeError("TODO: copy from CNDArray not supported yet") elif isinstance(array, np.ndarray): self._from_numpy(array) else: assert 0 + if isinstance(self.runtime, AcceleratorDevice): + self._to_device() def _from_numpy(self, nparray: np.ndarray) -> None: assert nparray.flags["C_CONTIGUOUS"] @@ -64,11 +82,48 @@ def _copy_to_numpy(self, out: np.ndarray) -> np.ndarray: return out def numpy(self, out: np.ndarray | None = None) -> np.ndarray: + if self.is_on_device(): + assert isinstance(self.runtime, AcceleratorDevice) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.runtime.memory_copy_from( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) if out is None: return self._to_numpy() else: return self._copy_to_numpy(out) + def _to_device(self) -> None: + assert ( + isinstance(self.runtime, AcceleratorDevice) + and self.location == 
NDArrayLocation.HOST + ) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.device_handle = self.runtime.memory_allocate(bytes_size) + self.runtime.memory_copy_to( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) + self.location = NDArrayLocation.DEVICE + + def _from_device(self) -> None: + assert ( + isinstance(self.runtime, AcceleratorDevice) + and self.location == NDArrayLocation.DEVICE + ) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.runtime.memory_copy_from( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) + self.runtime.memory_free(self.device_handle) + self.device_handle = None + self.location = NDArrayLocation.HOST + + def is_on_device(self) -> bool: + return self.location == NDArrayLocation.DEVICE + @property def dtype_str(self) -> str: assert self.handle is not None @@ -103,18 +158,21 @@ def size(self) -> int: @property def data(self) -> Any: assert self.handle is not None + if self.is_on_device(): + assert isinstance(self.runtime, AcceleratorDevice) + return self.runtime.memory_data_pointer(self.device_handle) return self.handle.contents.dl_tensor.data @classmethod def _copy_from(cls, handle: Any, data_handle: Any) -> None: - runtime.cndarray_copy_from_data( + HostRuntime.get().cndarray_copy_from_data( handle, data_handle, ) @classmethod def _copy_to(cls, handle: Any, data_handle: Any) -> None: - runtime.cndarray_copy_to_data( + HostRuntime.get().cndarray_copy_to_data( handle, data_handle, ) @@ -132,7 +190,7 @@ def _new( device = DLDevice(DLDeviceTypeCode.kDLCPU, 0) shape_array = (ctypes.c_int64 * len(shape))(*shape) dldtype = cls._dldatatype(np_dtype) - handle = runtime.cndarray_new( + handle = HostRuntime.get().cndarray_new( len(shape), ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)), dldtype, @@ -145,9 +203,14 @@ def _new( def __del__(self) -> None: if self.handle is not None: - runtime.cndarray_del(self.handle) + 
assert self.runtime is not None + self.runtime.cndarray_del(self.handle) self.handle = None + if self.device_handle is not None: + assert isinstance(self.runtime, AcceleratorDevice) + self.runtime.memory_free(self.device_handle) + self.device_handle = None @classmethod def set_alloc_alignment(cls, alignment: int) -> None: - runtime.cndarray_set_alloc_alignment(alignment) + HostRuntime.get().cndarray_set_alloc_alignment(alignment) diff --git a/src/xtc/targets/accelerator/gpu/GPUEvaluator.py b/src/xtc/targets/accelerator/gpu/GPUEvaluator.py new file mode 100644 index 000000000..7da994e54 --- /dev/null +++ b/src/xtc/targets/accelerator/gpu/GPUEvaluator.py @@ -0,0 +1,138 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +from xtc.runtimes.accelerator.gpu.GPUDevice import GPUDevice +import xtc.targets.accelerator.gpu as gpu +import xtc.itf as itf +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) + +__all__ = [ + "GPUEvaluator", + "GPUExecutor", +] + + +class GPUEvaluator(itf.exec.Evaluator): + def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: + self._device = GPUDevice() + self._module = module + self._repeat = kwargs.get("repeat", 1) + self._min_repeat_ms = kwargs.get("min_repeat_ms", 100) + self._number = kwargs.get("number", 1) + self._validate = kwargs.get("validate", False) + self._parameters = kwargs.get("parameters") + self._init_zero = kwargs.get("init_zero", False) + self._np_inputs_spec = kwargs.get( + "np_inputs_spec", self._module._np_inputs_spec + ) + self._np_outputs_spec = kwargs.get( + "np_outputs_spec", self._module._np_outputs_spec + ) + self._reference_impl = kwargs.get( + "reference_impl", self._module._reference_impl + ) + self._pmu_counters = kwargs.get("pmu_counters", []) + + assert self._module.file_type == "shlib", "only support shlib 
for evaluation" + + @override + def evaluate(self) -> tuple[list[float], int, str]: + assert self._module._bare_ptr, "bare_ptr is not supported for evaluation" + + # Initialize the device and load the module + self._device.init_device() + self._device.load_module(self._module) + sym = self._module.payload_name + func = self._device.get_module_function(self._module, sym) + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, + ) + + # Map the buffers + # TODO Replace memory mapping of buffers by explicit transfers + for i, buffer in enumerate(parameters[0]): + if self._np_inputs_spec()[i]["device"] is None: + self._device._register_buffer( + buffer.data, buffer.size * buffer.dtype.itemsize + ) + for i, buffer in enumerate(parameters[1]): + if self._np_outputs_spec()[i]["device"] is None: + self._device._register_buffer( + buffer.data, buffer.size * buffer.dtype.itemsize + ) + + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._device, + ) + + # Unmap the buffers + for i, buffer in enumerate(parameters[0]): + if self._np_inputs_spec()[i]["device"] is None: + self._device._unregister_buffer(buffer.data) + for i, buffer in enumerate(parameters[1]): + if self._np_outputs_spec()[i]["device"] is None: + self._device._unregister_buffer(buffer.data) + + # Unload the module + self._device.unload_module(self._module) + + # Copy out outputs + if self._parameters is not None: + copy_outputs(parameters, self._parameters) + + return results + + @property + @override + def module(self) -> 
itf.comp.Module: + return self._module + + +class GPUExecutor(itf.exec.Executor): + def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: + self._evaluator = GPUEvaluator( + module=module, + repeat=1, + min_repeat_ms=0, + number=1, + **kwargs, + ) + + @override + def execute(self) -> int: + results, code, err_msg = self._evaluator.evaluate() + return code + + @property + @override + def module(self) -> itf.comp.Module: + return self._evaluator.module diff --git a/src/xtc/targets/accelerator/gpu/GPUModule.py b/src/xtc/targets/accelerator/gpu/GPUModule.py new file mode 100644 index 000000000..9fd27f3c7 --- /dev/null +++ b/src/xtc/targets/accelerator/gpu/GPUModule.py @@ -0,0 +1,89 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +import xtc.itf as itf +from xtc.itf.graph import Graph +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) +from .GPUEvaluator import GPUExecutor, GPUEvaluator + + +__all__ = [ + "GPUModule", +] + + +class GPUModule(itf.comp.Module): + def __init__( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + graph: Graph | None = None, + **kwargs: Any, + ) -> None: + self._name = name + self._payload_name = payload_name + self._file_name = file_name + self._file_type = file_type + assert self._file_type == "shlib", "only support shlib for JIR Module" + lib_suffixes = ("so", "dylib") + assert self._file_name.endswith(lib_suffixes), ( + f"file name {self._file_name} is not a shlib" + ) + self._bare_ptr = kwargs.get("bare_ptr", True) + self._graph = graph + if self._graph is not None: + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) + else: + self._np_inputs_spec = kwargs.get("np_inputs_spec") + 
self._np_outputs_spec = kwargs.get("np_outputs_spec") + self._reference_impl = kwargs.get("reference_impl") + + @property + @override + def file_type(self) -> str: + return self._file_type + + @property + @override + def name(self) -> str: + return self._name + + @property + @override + def payload_name(self) -> str: + return self._payload_name + + @property + @override + def file_name(self) -> str: + return self._file_name + + @override + def export(self) -> None: + raise NotImplementedError("GPUModule.export is not implemented") + + @override + def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: + return GPUEvaluator( + self, + **kwargs, + ) + + @override + def get_executor(self, **kwargs: Any) -> itf.exec.Executor: + return GPUExecutor( + self, + **kwargs, + ) diff --git a/src/xtc/targets/gpu/__init__.py b/src/xtc/targets/accelerator/gpu/__init__.py similarity index 100% rename from src/xtc/targets/gpu/__init__.py rename to src/xtc/targets/accelerator/gpu/__init__.py diff --git a/src/xtc/targets/accelerator/mppa/MppaEvaluator.py b/src/xtc/targets/accelerator/mppa/MppaEvaluator.py new file mode 100644 index 000000000..9baf22042 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/MppaEvaluator.py @@ -0,0 +1,117 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +import xtc.targets.accelerator.mppa as mppa +import xtc.itf as itf +from xtc.runtimes.accelerator.mppa import MppaDevice +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) + +__all__ = [ + "MppaEvaluator", + "MppaExecutor", +] + + +class MppaEvaluator(itf.exec.Evaluator): + def __init__(self, module: "mppa.MppaModule", **kwargs: Any) -> None: + self._device = MppaDevice(module._mppa_config) + self._module = module + self._repeat = kwargs.get("repeat", 1) + self._min_repeat_ms = kwargs.get("min_repeat_ms", 
100) + self._number = kwargs.get("number", 1) + self._validate = kwargs.get("validate", False) + self._parameters = kwargs.get("parameters") + self._init_zero = kwargs.get("init_zero", False) + self._np_inputs_spec = kwargs.get( + "np_inputs_spec", self._module._np_inputs_spec + ) + self._np_outputs_spec = kwargs.get( + "np_outputs_spec", self._module._np_outputs_spec + ) + self._reference_impl = kwargs.get( + "reference_impl", self._module._reference_impl + ) + self._pmu_counters = kwargs.get("pmu_counters", []) + + assert self._module.file_type == "shlib", "only support shlib for evaluation" + + @override + def evaluate(self) -> tuple[list[float], int, str]: + assert self._module._bare_ptr, "bare_ptr is not supported for evaluation" + + # Initialize the device and load the module + self._device.init_device() + self._device.load_module(self._module) + sym = self._module.payload_name + func = self._device.get_module_function(self._module, sym) + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, + ) + + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._device, + ) + + # Unload the module + self._device.unload_module(self._module) + + # Copy out outputs + if self._parameters is not None: + copy_outputs(parameters, self._parameters) + + return results + + @property + @override + def module(self) -> itf.comp.Module: + return self._module + + +class MppaExecutor(itf.exec.Executor): + def __init__(self, module: "mppa.MppaModule", **kwargs: Any) -> None: + self._evaluator = 
MppaEvaluator( + module=module, + repeat=1, + min_repeat_ms=0, + number=1, + **kwargs, + ) + + @override + def execute(self) -> int: + results, code, err_msg = self._evaluator.evaluate() + return code + + @property + @override + def module(self) -> itf.comp.Module: + return self._evaluator.module diff --git a/src/xtc/targets/accelerator/mppa/MppaModule.py b/src/xtc/targets/accelerator/mppa/MppaModule.py new file mode 100644 index 000000000..746329e16 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/MppaModule.py @@ -0,0 +1,94 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any, cast +from typing_extensions import override + +from xtc.itf.graph import Graph +import xtc.itf as itf + +from .MppaEvaluator import MppaEvaluator, MppaExecutor +from xtc.runtimes.accelerator.mppa import MppaConfig + +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) + +__all__ = [ + "MppaModule", +] + + +class MppaModule(itf.comp.Module): + def __init__( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + mppa_config: MppaConfig, + graph: Graph | None = None, + **kwargs: Any, + ) -> None: + self._name = name + self._payload_name = payload_name + self._file_name = file_name + self._file_type = file_type + assert self._file_type == "shlib", "only support shlib for JIR Module" + lib_suffixes = ("so", "dylib") + assert self._file_name.endswith(lib_suffixes), ( + f"file name {self._file_name} is not a shlib" + ) + self._bare_ptr = kwargs.get("bare_ptr", True) + self._graph = graph + if self._graph is not None: + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) + else: + self._np_inputs_spec = kwargs.get("np_inputs_spec") + self._np_outputs_spec = kwargs.get("np_outputs_spec") + self._reference_impl = 
kwargs.get("reference_impl") + self._mppa_config = mppa_config # FIXME: remove passing config to module + # TODO Handle shlib of multiple kernels on fly + + @property + @override + def file_type(self) -> str: + return self._file_type + + @property + @override + def name(self) -> str: + return self._name + + @property + @override + def payload_name(self) -> str: + return self._payload_name + + @property + @override + def file_name(self) -> str: + return self._file_name + + @override + def export(self) -> None: + raise NotImplementedError("AcceleratorModule.export is not implemented") + + @override + def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: + return MppaEvaluator( + self, + **kwargs, + ) + + @override + def get_executor(self, **kwargs: Any) -> itf.exec.Executor: + return MppaExecutor( + self, + **kwargs, + ) diff --git a/src/xtc/targets/accelerator/mppa/__init__.py b/src/xtc/targets/accelerator/mppa/__init__.py new file mode 100644 index 000000000..5ab2c51d8 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/__init__.py @@ -0,0 +1,6 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .MppaModule import MppaModule +from .MppaEvaluator import MppaEvaluator, MppaExecutor diff --git a/src/xtc/targets/gpu/GPUEvaluator.py b/src/xtc/targets/gpu/GPUEvaluator.py deleted file mode 100644 index 1017d4063..000000000 --- a/src/xtc/targets/gpu/GPUEvaluator.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from typing_extensions import override -import numpy as np -import ctypes - -from xtc.runtimes.types.ndarray import NDArray -import xtc.runtimes.gpu.runtime as gpu_runtime -from xtc.utils.numpy import ( - np_init, -) -from xtc.utils.tools import get_mlir_prefix -from xtc.utils.loader import LibLoader -from xtc.utils.ext_tools import cuda_runtime_lib - -import xtc.itf as itf -import xtc.targets.gpu 
as gpu -from xtc.targets.host import HostEvaluator, HostExecutor - - -__all__ = [ - "GPUEvaluator", - "GPUExecutor", -] - - -class GPUEvaluator(HostEvaluator): - def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: - self._runtime_lib = LibLoader(f"{get_mlir_prefix()}/lib/{cuda_runtime_lib}") - kwargs["register_buffer_fn"] = self._register_buffer - kwargs["unregister_buffer_fn"] = self._unregister_buffer - kwargs["runtime"] = gpu_runtime - super().__init__(module, **kwargs) - - def __exit(self, exc_type, exc_value, traceback) -> None: - runtime_lib.close() - - def _register_buffer(self, buffer: NDArray) -> None: - nb_bytes = buffer.size * buffer.dtype.itemsize - nb_bytes_c = ctypes.c_int64(nb_bytes) - buffer_ptr = ctypes.cast(buffer.data, ctypes.c_void_p) - func_name = "mgpuMemHostRegister" - func = getattr(self._runtime_lib.lib, func_name) - assert func is not None, ( - f"Cannot find symbol {func_name} in lib {self._runtime_lib.lib}" - ) - func(buffer_ptr, nb_bytes_c) - - def _unregister_buffer(self, buffer: NDArray) -> None: - buffer_ptr = ctypes.cast(buffer.data, ctypes.c_void_p) - func_name = "mgpuMemHostUnregister" - func = getattr(self._runtime_lib.lib, func_name) - assert func is not None, ( - f"Cannot find symbol {func_name} in lib {self._runtime_lib.lib}" - ) - func(buffer_ptr) - - -class GPUExecutor(HostExecutor): - def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: - self._evaluator = GPUEvaluator( - module=module, - repeat=1, - min_repeat_ms=0, - number=1, - **kwargs, - ) diff --git a/src/xtc/targets/gpu/GPUModule.py b/src/xtc/targets/gpu/GPUModule.py deleted file mode 100644 index 8948db05a..000000000 --- a/src/xtc/targets/gpu/GPUModule.py +++ /dev/null @@ -1,47 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any, cast -from typing_extensions import override - -import xtc.itf as itf -from xtc.itf.graph import Graph -from xtc.graphs.xtc.graph 
import XTCGraph -from xtc.graphs.xtc.data import XTCTensor -from xtc.graphs.xtc.expr import XTCTensorExpr - -from .GPUEvaluator import GPUExecutor, GPUEvaluator -from xtc.targets.host import HostModule - - -__all__ = [ - "GPUModule", -] - - -class GPUModule(HostModule): - def __init__( - self, - name: str, - payload_name: str, - file_name: str, - file_type: str, - graph: Graph | None = None, - **kwargs: Any, - ) -> None: - super().__init__(name, payload_name, file_name, file_type, graph, **kwargs) - - @override - def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: - return GPUEvaluator( - self, - **kwargs, - ) - - @override - def get_executor(self, **kwargs: Any) -> itf.exec.Executor: - return GPUExecutor( - self, - **kwargs, - ) diff --git a/src/xtc/targets/host/HostEvaluator.py b/src/xtc/targets/host/HostEvaluator.py index cae767d7a..c0f9b1af7 100644 --- a/src/xtc/targets/host/HostEvaluator.py +++ b/src/xtc/targets/host/HostEvaluator.py @@ -5,16 +5,25 @@ from typing import Any from typing_extensions import override import numpy as np +from pathlib import Path from xtc.runtimes.types.ndarray import NDArray from xtc.utils.numpy import ( np_init, ) -from xtc.runtimes.host.evaluate import load_and_evaluate + +from xtc.runtimes.host.HostRuntime import HostRuntime import xtc.itf as itf import xtc.targets.host as host +from xtc.utils.loader import LibLoader +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) __all__ = [ "HostEvaluator", @@ -40,78 +49,53 @@ def __init__(self, module: "host.HostModule", **kwargs: Any) -> None: self._reference_impl = kwargs.get( "reference_impl", self._module._reference_impl ) - self._register_buffer_fn = kwargs.get("register_buffer_fn", None) - self._unregister_buffer_fn = kwargs.get("unregister_buffer_fn", None) self._pmu_counters = kwargs.get("pmu_counters", []) - self._runtime = kwargs.get("runtime", None) - if self._runtime is None: - import 
xtc.runtimes.host.runtime as host_runtime - - self._runtime = host_runtime + self._runtime = kwargs.get("runtime", HostRuntime()) assert self._module.file_type == "shlib", "only support shlib for evaluation" @override def evaluate(self) -> tuple[list[float], int, str]: - if self._parameters is None: - assert self._np_inputs_spec is not None - assert self._np_outputs_spec is not None - inputs_spec = self._np_inputs_spec() - outputs_spec = self._np_outputs_spec() - out_init = np.zeros if self._init_zero else np.empty - inputs = [np_init(**spec) for spec in inputs_spec] - outputs = [out_init(**spec) for spec in outputs_spec] - parameters = ( - [NDArray(inp) for inp in inputs], - [NDArray(out) for out in outputs], - ) - else: - inputs, outputs = self._parameters - nd_inputs = [ - NDArray(inp) if isinstance(inp, np.ndarray) else inp for inp in inputs - ] - nd_outputs = [ - NDArray(out) if isinstance(out, np.ndarray) else out for out in outputs - ] - parameters = (nd_inputs, nd_outputs) - - ref_outputs = [] - if self._validate: - assert self._reference_impl is not None - ref_inputs = [inp.numpy() for inp in parameters[0]] - ref_outputs = [ - np.empty(shape=out.shape, dtype=out.dtype) for out in parameters[1] - ] - self._reference_impl(*ref_inputs, *ref_outputs) - - if self._register_buffer_fn is not None: - for buffer in parameters[0] + parameters[1]: - self._register_buffer_fn(buffer) - - results = load_and_evaluate( - runtime=self._runtime, - module_file=self._module.file_name, - module_name=self._module.name, - payload_name=self._module.payload_name, - bare_ptr=self._module._bare_ptr, - parameters=parameters, - validate=self._validate, - ref_outputs=ref_outputs, - repeat=self._repeat, - min_repeat_ms=self._min_repeat_ms, - number=self._number, - pmu_counters=self._pmu_counters, + # Load the module + dll = str(Path(self._module.file_name).absolute()) + lib = LibLoader(dll) + sym = self._module.payload_name + func = getattr(lib.lib, sym) + func.packed = not 
self._module._bare_ptr + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, ) - if self._unregister_buffer_fn is not None: - for buffer in parameters[0] + parameters[1]: - self._unregister_buffer_fn(buffer) + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + assert self._runtime is not None + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._runtime, + ) + + # Unload the module + lib.close() + # Copy out outputs if self._parameters is not None: - _, outputs = self._parameters - _, outputs_copy = parameters - for out, out_copy in zip(outputs, outputs_copy): - if isinstance(out, np.ndarray): - out_copy.numpy(out=out) + copy_outputs(parameters, self._parameters) return results diff --git a/src/xtc/targets/host/HostModule.py b/src/xtc/targets/host/HostModule.py index a3b1cbd01..bf3c44870 100644 --- a/src/xtc/targets/host/HostModule.py +++ b/src/xtc/targets/host/HostModule.py @@ -5,13 +5,13 @@ from typing import Any, cast from typing_extensions import override -import sys - import xtc.itf as itf from xtc.itf.graph import Graph -from xtc.graphs.xtc.graph import XTCGraph -from xtc.graphs.xtc.data import XTCTensor -from xtc.graphs.xtc.expr import XTCTensorExpr +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) from .HostEvaluator import HostExecutor, HostEvaluator @@ -40,59 +40,17 @@ def __init__( assert self._file_name.endswith(lib_suffixes), ( f"file name {self._file_name} is not a shlib" ) - self._bare_ptr = kwargs.get("bare_ptr", True) self._graph = graph if 
self._graph is not None: - self._np_inputs_spec = self._graph_np_inputs_spec - self._np_outputs_spec = self._graph_np_outputs_spec - self._reference_impl = self._graph_reference_impl + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) else: self._np_inputs_spec = kwargs.get("np_inputs_spec") self._np_outputs_spec = kwargs.get("np_outputs_spec") self._reference_impl = kwargs.get("reference_impl") - def _graph_np_inputs_spec(self) -> list[dict[str, Any]]: - assert isinstance(self._graph, XTCGraph) - assert all( - [ - isinstance(node._expr, XTCTensorExpr) and node._expr.type.is_constant() - for node in self._graph.inputs_nodes - ] - ), f"graph inputs are not tensors" - inputs_types = [ - cast(XTCTensorExpr, node._expr).type for node in self._graph.inputs_nodes - ] - return [ - { - "shape": type.constant_shape, - "dtype": type.constant_dtype, - } - for type in inputs_types - ] - - def _graph_np_outputs_spec(self) -> list[dict[str, Any]]: - assert isinstance(self._graph, XTCGraph) - assert all( - [node._outputs_types is not None for node in self._graph.outputs_nodes] - ), f"graph types were not forwarded" - return [ - { - "shape": type.constant_shape, - "dtype": type.constant_dtype, - } - for type in [ - cast(list, node._outputs_types)[0] for node in self._graph.outputs_nodes - ] - ] - - def _graph_reference_impl(self, *args: Any) -> None: - assert self._graph is not None - inputs = [XTCTensor(inp) for inp in args[: len(self._graph.inputs)]] - outputs = self._graph.forward(inputs) - for idx, out in enumerate(args[len(self._graph.inputs) :]): - out[:] = outputs[idx].numpy() - @property @override def file_type(self) -> str: diff --git a/src/xtc/utils/cfunc.py b/src/xtc/utils/cfunc.py new file mode 100644 index 000000000..bc5ee41be --- /dev/null +++ b/src/xtc/utils/cfunc.py @@ -0,0 +1,121 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 
(c) 2024-2026 The XTC Project Authors +# +from typing import Any +import ctypes + +__all__ = [ + "CFunc", + "CArgValue", + "CArgCode", + "CRetValue", + "CPackedFunc", + "_c_ascii_str", + "_str_list_to_c", +] + + +class ArgTypeCode: + INT = 0 + HANDLE = 3 + NDARRAY_HANDLE = 13 + + +CArgCode = ctypes.c_int + + +class CArgValue(ctypes.Union): + _fields_ = [ + ("v_int64", ctypes.c_int64), + ("v_float64", ctypes.c_double), + ("v_handle", ctypes.c_void_p), + ("v_str", ctypes.c_char_p), + ] + + +class CRetValue(CArgValue): + pass + + +CPackedFunc = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.POINTER(CArgValue), + ctypes.POINTER(CArgCode), + ctypes.c_int, + ctypes.POINTER(CRetValue), + ctypes.POINTER(CArgCode), +) + + +class CFunc: + def __init__(self, f: Any, packed: bool = False) -> None: + self.handle = f + self.is_packed = packed or ( + hasattr(self.handle, "packed") and self.handle.packed + ) + + def arg_tuple(self, arg: Any) -> Any: + if arg.__class__.__name__ == "ndarray": # Numpy Array + assert not self.is_packed + return (arg.ctypes.data_as(ctypes.c_voidp), ArgTypeCode.HANDLE) + elif arg.__class__.__name__ == "NDArray": # TVM NDArray or our NDArray + if ( + hasattr(arg, "is_on_device") and arg.is_on_device() + ): # Device living NDArray + if self.is_packed: + raise RuntimeError("TODO: device NDArray not supported yet") + else: + return ( + ctypes.cast(arg.data, ctypes.c_void_p), + ArgTypeCode.HANDLE, + ) + if self.is_packed: + return ( + CArgValue(v_handle=ctypes.cast(arg.handle, ctypes.c_void_p)), + ArgTypeCode.NDARRAY_HANDLE, + ) + else: + return ( + ctypes.cast(arg.handle.contents.dl_tensor.data, ctypes.c_void_p), + ArgTypeCode.HANDLE, + ) + else: + assert 0, f"Unsupported argument class: {arg.__class__.__name__}" + + def args_tuples(self, args: Any) -> list[Any]: + return [self.arg_tuple(arg) for arg in args] + + def __call__(self, *args: Any): + args_tuples = self.args_tuples(args) + if self.is_packed: + args_array = (CArgValue * len(args_tuples))( + *[arg[0] 
for arg in args_tuples] + ) + args_codes = (CArgCode * len(args_tuples))(*[arg[1] for arg in args_tuples]) + result_val = CRetValue(0) + result_code = CArgCode(ArgTypeCode.INT) + res = CPackedFunc(self.handle)( + args_array, + args_codes, + len(args_tuples), + ctypes.byref(result_val), + ctypes.byref(result_code), + ctypes.c_int(len(args_tuples)), + ) + assert res == 0, f"error calling packed function" + else: + data_args = [arg[0] for arg in args_tuples] + self.handle(*data_args) + + +class _c_ascii_str: + @staticmethod + def from_param(obj: str | bytes): + if isinstance(obj, str): + obj = obj.encode("ascii") + return ctypes.c_char_p.from_param(obj) + + +def _str_list_to_c(str_list: list[str]) -> Any: + return (ctypes.c_char_p * len(str_list))(*[str.encode("utf-8") for str in str_list]) diff --git a/src/xtc/utils/evaluation.py b/src/xtc/utils/evaluation.py new file mode 100644 index 000000000..e5deba670 --- /dev/null +++ b/src/xtc/utils/evaluation.py @@ -0,0 +1,194 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any, Callable, cast +from xtc.itf.graph import Graph +import ctypes +import numpy as np +from xtc.utils.numpy import np_init +from xtc.runtimes.types.ndarray import NDArray +from xtc.graphs.xtc.graph import XTCGraph +from xtc.graphs.xtc.expr import XTCTensorExpr +from xtc.graphs.xtc.data import XTCTensor +from xtc.utils.cfunc import CFunc, CArgValue, CArgCode +from xtc.itf.runtime.common import CommonRuntimeInterface +from xtc.runtimes.host.HostRuntime import HostRuntime + +__all__ = [] + + +def graph_np_inputs_spec(graph: Graph) -> Callable[[], list[dict[str, Any]]]: + assert isinstance(graph, XTCGraph) + assert all( + [ + isinstance(node._expr, XTCTensorExpr) and node._expr.type.is_constant() + for node in graph.inputs_nodes + ] + ), f"graph inputs are not tensors" + + def _graph_np_inputs_spec() -> list[dict[str, Any]]: + inputs_types = [ + cast(XTCTensorExpr, node._expr).type for 
node in graph.inputs_nodes + ] + return [ + { + "shape": type.constant_shape, + "dtype": type.constant_dtype, + "device": type.device, + } + for type in inputs_types + ] + + return _graph_np_inputs_spec + + +def graph_np_outputs_spec(graph: Graph) -> Callable[[], list[dict[str, Any]]]: + assert isinstance(graph, XTCGraph) + assert all([node._outputs_types is not None for node in graph.outputs_nodes]), ( + f"graph types were not forwarded" + ) + + def _graph_np_outputs_spec() -> list[dict[str, Any]]: + return [ + { + "shape": type.constant_shape, + "dtype": type.constant_dtype, + "device": type.device, + } + for type in [ + cast(list, node._outputs_types)[0] for node in graph.outputs_nodes + ] + ] + + return _graph_np_outputs_spec + + +def graph_reference_impl(graph: Graph) -> Callable[[], None]: + def _graph_reference_impl(*args: Any) -> None: + inputs = [XTCTensor(inp) for inp in args[: len(graph.inputs)]] + outputs = graph.forward(inputs) + for idx, out in enumerate(args[len(graph.inputs) :]): + out[:] = outputs[idx].numpy() + + return _graph_reference_impl + + +def ensure_ndarray_parameters( + parameters: tuple[Any, Any] | None, + np_inputs_spec: Callable[[], list[dict[str, Any]]] | None, + np_outputs_spec: Callable[[], list[dict[str, Any]]] | None, + init_zero: bool = False, +) -> tuple[list[NDArray], list[NDArray]]: + if parameters is None: + assert np_inputs_spec is not None + assert np_outputs_spec is not None + inputs_spec = np_inputs_spec() + outputs_spec = np_outputs_spec() + out_init = np.zeros if init_zero else np.empty + inputs = [ + (np_init(**spec), spec["device"] if "device" in spec else HostRuntime.get()) + for spec in inputs_spec + ] + outputs = [ + out_init(**{k: v for k, v in spec.items() if k != "device"}) + for spec in outputs_spec + ] + parameters = ( + [NDArray(*inp) for inp in inputs], + [ + NDArray( + out, + runtime=spec["device"] if "device" in spec else HostRuntime.get(), + ) + for out, spec in zip(outputs, outputs_spec) + ], + ) + else: 
+ inputs, outputs = parameters + nd_inputs = [ + NDArray(inp) if isinstance(inp, np.ndarray) else inp for inp in inputs + ] + nd_outputs = [ + NDArray(out) if isinstance(out, np.ndarray) else out for out in outputs + ] + parameters = (nd_inputs, nd_outputs) + return parameters + + +def validate_outputs( + func: Callable[[Any], Any], + parameters: tuple[list[NDArray], list[NDArray]], + reference_impl: Callable[[], None], +) -> tuple[list[float], int, str]: + # Get the reference outputs + assert reference_impl is not None + ref_inputs = [inp.numpy() for inp in parameters[0]] + ref_outputs = [np.empty(shape=out.shape, dtype=out.dtype) for out in parameters[1]] + reference_impl(*ref_inputs, *ref_outputs) + # Get the function outputs + CFunc(func)(*parameters[0], *parameters[1]) + # Compare + for out_ref, out in zip(ref_outputs, [out.numpy() for out in parameters[1]]): + if not np.allclose(out_ref, out): + return ([], 1, "Error in validation: outputs differ") + return ([], 0, "") + + +def evaluate_performance( + func: Callable[[Any], Any], + parameters: tuple[list[NDArray], list[NDArray]], + pmu_counters: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + runtime: CommonRuntimeInterface | Any, +) -> tuple[list[float], int, str]: + # TODO migrate host runtime to CommonRuntimeInterface + cfunc = CFunc(func) + args_tuples = cfunc.args_tuples([*parameters[0], *parameters[1]]) + values_num = 1 + if len(pmu_counters) > 0: + values_num = len(pmu_counters) + # FIXME check if the PMU counters are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + if cfunc.is_packed: + args_array_packed = (CArgValue * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + args_codes_packed = (CArgCode * len(args_tuples))( + *[arg[1] for arg in args_tuples] + ) + runtime.evaluate_packed_perf( + results_array, + pmu_counters, + repeat, + number, + min_repeat_ms, + cfunc, + args_array_packed, + args_codes_packed, + len(args_tuples), + ) + 
eval_results = [float(x) for x in results_array] + else: + eval_results = runtime.evaluate_perf( + pmu_counters, + repeat, + number, + min_repeat_ms, + cfunc, + args_tuples, + ) + return (eval_results, 0, "") + + +def copy_outputs( + parameters: tuple[list[NDArray], list[NDArray]], + target_parameters: tuple[list[NDArray], list[NDArray]], +) -> None: + _, outputs = target_parameters + _, outputs_copy = parameters + for out, out_copy in zip(outputs, outputs_copy): + if isinstance(out, np.ndarray): + out_copy.numpy(out=out) diff --git a/src/xtc/utils/numpy.py b/src/xtc/utils/numpy.py index 500b6483d..5e85a978b 100644 --- a/src/xtc/utils/numpy.py +++ b/src/xtc/utils/numpy.py @@ -9,7 +9,7 @@ from .math import mulall -def np_init(shape: tuple, dtype: str) -> numpy.typing.NDArray[Any]: +def np_init(shape: tuple, dtype: str, **attrs: Any) -> numpy.typing.NDArray[Any]: """ Initialize and return a NP array filled with numbers in [1, 9]. diff --git a/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py b/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py new file mode 100644 index 000000000..c28c81342 --- /dev/null +++ b/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py @@ -0,0 +1,161 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.gpu import GPUDevice + +# Create device +gpu = GPUDevice() + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") # A lives on the host +b = O.tensor((K, J), dtype, name="B", device=gpu) # B lives on the accelerator + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C", device=gpu) # C lives on the accelerator + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.tile("i", {"i1": 2}) +sch.tile("j", {"j1": 16}) +sch.unroll({"i1": 2})
+sch.parallelize(["i"]) +sched = sch.schedule() + +comp = impl.get_compiler( + target=gpu, + shared_lib=True, + dump_file="gpu_matmul_mlir_offload_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x32xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : 
(!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_op, %forall_op = transform.structured.tile_using_forall %1 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %forall_op "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./k" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./i1" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./j1" : !transform.any_op +# CHECK-NEXT: transform.loop.unroll %loops_7 {factor = 2 : i64} : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x32xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = 
arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_0 to %c32 step %c1_1 { +# CHECK-NEXT: %subview_2 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.forall (%arg3) in (2) { +# CHECK-NEXT: %0 = affine.apply #map(%arg3) +# CHECK-NEXT: %subview = memref.subview %arg0[%0, 0] [2, 512] [1, 1] : memref<4x512xf32> to memref<2x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_1 = memref.subview %arg2[%0, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c32 step %c16 { +# CHECK-NEXT: %subview_3 = memref.subview %subview[0, 0] [2, 512] [1, 1] : memref<2x512xf32, strided<[512, 1], offset: ?>> to memref<2x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %subview_0[0, %arg4] [512, 16] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %subview_1[0, %arg4] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# 
CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_6 to %c512 step %c1_7 { +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg5] [2, 1] [1, 1] : memref<2x512xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_4[%arg5, 0] [1, 16] [1, 1] : memref<512x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %c2_13 = arith.constant 2 : index +# CHECK-NEXT: %subview_14 = memref.subview %subview_8[%c0_11, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_10[%c0_11, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c16_18 = arith.constant 16 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c16_18 step %c1_19 { +# CHECK-NEXT: %subview_27 = memref.subview %subview_14[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_15[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 
1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_16[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j1"} +# CHECK-NEXT: %c1_20 = arith.constant 1 : index +# CHECK-NEXT: %1 = arith.muli %c1_12, %c1_20 : index +# CHECK-NEXT: %2 = arith.addi %c0_11, %1 : index +# CHECK-NEXT: %subview_21 = memref.subview %subview_8[%2, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_10[%2, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c16_25 = arith.constant 16 : index +# CHECK-NEXT: %c1_26 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_24 to %c16_25 step %c1_26 { +# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_23[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, 
strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py new file mode 100644 index 000000000..665867b16 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py @@ -0,0 +1,244 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +I, J, K, dtype = 16, 16, 64, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 1, "psy": 1}) +sch.tile("i", {"i1": 8}) +sch.tile("j", {"j1": 8}) +sch.interchange(["i", "j", "i1", "j1", "k"]) +sch.vectorize(["i1", "j1", "k"]) +#sch.pack_at("i1", 1) +sched = sch.schedule() + +# Create mppa device +mppa = MppaDevice() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_mppa", + print_source_ir=True, + print_transformed_ir=True, + print_lowered_ir=True, 
+) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<16x16xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<16x64xf32>, memref<64x16xf32>) outs(%arg2 : memref<16x16xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=1, "psy"=1]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = 
transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 tile_sizes [8, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: transform.annotate %tiled_linalg_op_4 "xtc.request_vectorization" : !transform.any_op +# CHECK-NEXT: %4 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %4 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %4 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=1, "psy"=1]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : 
index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %subview_0 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_0 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [8, 64] [1, 1] : memref<16x64xf32> to memref<8x64xf32, strided<[64, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[0, 0] [64, 16] [1, 1] : memref<64x16xf32> to memref<64x16xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_1 = memref.subview %arg2[%arg3, 0] [8, 16] [1, 1] : memref<16x16xf32> to memref<8x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %subview_2 = memref.subview %subview_0[0, %arg4] [64, 8] [1, 1] : memref<64x16xf32, strided<[16, 1]>> to memref<64x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %subview_1[0, %arg4] [8, 8] [1, 1] : memref<8x16xf32, strided<[16, 1], offset: ?>> to memref<8x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_, xtc.request_vectorization} ins(%subview, %subview_2 : memref<8x64xf32, strided<[64, 1], offset: ?>>, memref<64x8xf32, strided<[16, 1], offset: ?>>) outs(%subview_3 : memref<8x8xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After MLIR Opt //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2) -> (d0, 
d2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +# CHECK-NEXT: "builtin.module"() ({ +# CHECK-NEXT: "func.func"() <{arg_attrs = [{llvm.noalias}, {llvm.noalias}, {llvm.noalias}], function_type = (memref<16x64xf32>, memref<64x16xf32>, memref<16x16xf32>) -> (), sym_name = "matmul"}> ({ +# CHECK-NEXT: ^bb0(%arg0: memref<16x64xf32>, %arg1: memref<64x16xf32>, %arg2: memref<16x16xf32>): +# CHECK-NEXT: "mppa.launch"() ({ +# CHECK-NEXT: "kvxcluster.launch"() ({ +# CHECK-NEXT: ^bb0(%arg3: index): +# CHECK-NEXT: %0 = "arith.constant"() <{value = 1 : index}> : () -> index +# CHECK-NEXT: %1 = "arith.constant"() <{value = 16 : index}> : () -> index +# CHECK-NEXT: %2 = "arith.constant"() <{value = 0 : index}> : () -> index +# CHECK-NEXT: %3 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32 +# CHECK-NEXT: %4 = "arith.constant"() <{value = 8 : index}> : () -> index +# CHECK-NEXT: "scf.for"(%2, %1, %0) ({ +# CHECK-NEXT: ^bb0(%arg9: index): +# CHECK-NEXT: %11 = "memref.subview"(%arg2, %arg9) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x16xf32>, index) -> memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "scf.for"(%2, %1, %0) ({ +# CHECK-NEXT: ^bb0(%arg10: index): +# CHECK-NEXT: %12 = "memref.subview"(%11, %arg10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x16xf32, strided<[16, 1], offset: ?>>, index) -> memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "linalg.fill"(%3, %12) <{operandSegmentSizes = array}> ({ +# CHECK-NEXT: ^bb0(%arg11: f32, %arg12: f32): +# CHECK-NEXT: "linalg.yield"(%arg11) : (f32) -> () +# CHECK-NEXT: }) {__xtc_id_C_0_} : (f32, memref<1x1xf32, strided<[16, 1], offset: ?>>) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./j"} : (index, index, index) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# 
CHECK-NEXT: }) {"./i"} : (index, index, index) -> () +# CHECK-NEXT: "scf.for"(%2, %1, %4) ({ +# CHECK-NEXT: ^bb0(%arg4: index): +# CHECK-NEXT: %5 = "memref.subview"(%arg0, %arg4) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x64xf32>, index) -> memref<8x64xf32, strided<[64, 1], offset: ?>> +# CHECK-NEXT: %6 = "memref.subview"(%arg2, %arg4) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x16xf32>, index) -> memref<8x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "scf.for"(%2, %1, %4) ({ +# CHECK-NEXT: ^bb0(%arg5: index): +# CHECK-NEXT: %7 = "memref.subview"(%arg1, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<64x16xf32>, index) -> memref<64x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %8 = "memref.subview"(%6, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<8x16xf32, strided<[16, 1], offset: ?>>, index) -> memref<8x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "linalg.matmul"(%5, %7, %8) <{indexing_maps = [#map, #map1, #map2], operandSegmentSizes = array}> ({ +# CHECK-NEXT: ^bb0(%arg6: f32, %arg7: f32, %arg8: f32): +# CHECK-NEXT: %9 = "arith.mulf"(%arg6, %arg7) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 +# CHECK-NEXT: %10 = "arith.addf"(%arg8, %9) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 +# CHECK-NEXT: "linalg.yield"(%10) : (f32) -> () +# CHECK-NEXT: }) {__xtc_id_C_, xtc.request_vectorization} : (memref<8x64xf32, strided<[64, 1], offset: ?>>, memref<64x8xf32, strided<[16, 1], offset: ?>>, memref<8x8xf32, strided<[16, 1], offset: ?>>) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./j"} : (index, index, index) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./i"} : (index, index, index) -> () +# CHECK-NEXT: 
"kvxcluster.launch_terminator"() : () -> () +# CHECK-NEXT: }) {mask = 1 : i32, nclusters = 1 : i32} : () -> () +# CHECK-NEXT: "kvxcluster.await_all"() : () -> () +# CHECK-NEXT: "mppa.yield"() : () -> () +# CHECK-NEXT: }) {device = 1 : i32} : () -> () +# CHECK-NEXT: "func.return"() : () -> () +# CHECK-NEXT: }) : () -> () +# CHECK-NEXT: }) {transform.with_named_sequence} : () -> () +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After MPPA Opt //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @kvxcluster_launch_0_kernel_cc_0(%arg0: memref<16x16xf32, 2>, %arg1: memref<16x64xf32, 2>, %arg2: memref<64x16xf32, 2>) attributes {kernel_for_cluster_id = 0 : index} { +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %0 = arith.muli %arg3, %c16 overflow : index +# CHECK-NEXT: %1 = arith.addi %0, %arg4 : index +# CHECK-NEXT: %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1, 1], strides: [16, 1] : memref<16x16xf32, 2> to memref<1x1xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch %arg5 (npes=1) { +# CHECK-NEXT: memref.store %cst, %reinterpret_cast[%c0, %c0] : memref<1x1xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch_terminator +# CHECK-NEXT: } +# CHECK-NEXT: kvxpe.await_all +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %0 = arith.muli %arg3, %c64 overflow : index +# CHECK-NEXT: %reinterpret_cast = memref.reinterpret_cast %arg1 to offset: [%0], sizes: [8, 64], strides: [64, 1] : memref<16x64xf32, 2> to memref<8x64xf32, strided<[64, 1], 
offset: ?>, 2> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %reinterpret_cast_0 = memref.reinterpret_cast %arg2 to offset: [%arg4], sizes: [64, 8], strides: [16, 1] : memref<64x16xf32, 2> to memref<64x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: %1 = arith.muli %arg3, %c16 overflow : index +# CHECK-NEXT: %2 = arith.addi %1, %arg4 : index +# CHECK-NEXT: %reinterpret_cast_1 = memref.reinterpret_cast %arg0 to offset: [%2], sizes: [8, 8], strides: [16, 1] : memref<16x16xf32, 2> to memref<8x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch %arg5 (npes=1) { +# CHECK-NEXT: kvxuks.mma_8x8xf32 %reinterpret_cast, %reinterpret_cast_0 -> %reinterpret_cast_1 : memref<8x64xf32, strided<[64, 1], offset: ?>, 2>, memref<64x8xf32, strided<[16, 1], offset: ?>, 2>, memref<8x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch_terminator +# CHECK-NEXT: } +# CHECK-NEXT: kvxpe.await_all +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: mppa.launch(k300) { +# CHECK-NEXT: %0 = mppa.alloc : memref<16x16xf32, 2> +# CHECK-NEXT: mppa.copy %arg2, %0 : memref<16x16xf32> to memref<16x16xf32, 2> +# CHECK-NEXT: %1 = mppa.alloc : memref<16x64xf32, 2> +# CHECK-NEXT: mppa.copy %arg0, %1 : memref<16x64xf32> to memref<16x64xf32, 2> +# CHECK-NEXT: %2 = mppa.alloc : memref<64x16xf32, 2> +# CHECK-NEXT: mppa.copy %arg1, %2 : memref<64x16xf32> to memref<64x16xf32, 2> +# CHECK-NEXT: kvxcluster.launch (nclusters=1, mask=1) +# CHECK-NEXT: 0 -> @kvxcluster_launch_0_kernel_cc_0 +# CHECK-NEXT: with (%0, %1, %2) : memref<16x16xf32, 2>, memref<16x64xf32, 2>, memref<64x16xf32, 2> +# CHECK-NEXT: kvxcluster.await_all +# CHECK-NEXT: mppa.dealloc %2 : memref<64x16xf32, 2> +# CHECK-NEXT: mppa.dealloc %1 : memref<16x64xf32, 2> +# CHECK-NEXT: mppa.copy 
%0, %arg2 : memref<16x16xf32, 2> to memref<16x16xf32> +# CHECK-NEXT: mppa.dealloc %0 : memref<16x16xf32, 2> +# CHECK-NEXT: kvxcluster.await_all +# CHECK-NEXT: } +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 16x64xfloat32 +# CHECK-NEXT: - %1 : 64x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 16x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [16x64xfloat32, 64x16xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py new file mode 100644 index 000000000..f3115e667 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py @@ -0,0 +1,149 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +I, J, K, dtype = 4, 8, 16, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 2, "psy": 8}) +sch.tile("i", {"i1": 2}) +sch.pack_at("i1", 1) +sched = sch.schedule() + +# Create mppa device +mppa = MppaDevice() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_mppa", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform 
//----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias}, %arg2: memref<4x8xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x16xf32>, memref<16x8xf32>) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=2, "psy"=8]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 
tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %tiled_linalg_op_6 { +# CHECK-NEXT: transform.apply_patterns.memref.fold_memref_alias_ops +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %4 = transform.sdist.local_buffer_at %tiled_linalg_op_6 tensor 1 : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=2, "psy"=8]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias}, %arg2: memref<4x8xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# 
CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 8] [1, 1] : memref<4x8xf32> to memref<1x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c8 step %c1_3 { +# CHECK-NEXT: %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x8xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [2, 16] [1, 1] : memref<4x16xf32> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, 0] [16, 8] [1, 1] : memref<16x8xf32> to memref<16x8xf32, strided<[8, 1]>> +# CHECK-NEXT: %subview_3 = memref.subview %arg2[%arg3, 0] [2, 8] [1, 1] : memref<4x8xf32> to memref<2x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { +# CHECK-NEXT: %subview_6 = memref.subview %subview[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %arg4] [16, 1] [1, 1] : memref<16x8xf32, strided<[8, 1]>> to memref<16x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [2, 1] [1, 1] : memref<2x8xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_9 
= arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c16 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_6[0, %arg5] [2, 1] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %alloc = memref.alloc() : memref<1x1xf32, 2> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: sdist.read %subview_7[%arg5, %c0_14] to %alloc : memref<16x1xf32, strided<[8, 1], offset: ?>>, memref<1x1xf32, 2> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c2_16 = arith.constant 2 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_15 to %c2_16 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_11[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %alloc[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 2> to memref<1x1xf32, strided<[1, 1]>, 2> +# CHECK-NEXT: %subview_20 = memref.subview %subview_13[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_18, %subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[1, 1]>, 2>) outs(%subview_20 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./i1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# 
CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x16xfloat32 +# CHECK-NEXT: - %1 : 16x8xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x8xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x16xfloat32, 16x8xfloat32] -> [4x8xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py new file mode 100644 index 000000000..677702c41 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py @@ -0,0 +1,149 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +# Create device +mppa = MppaDevice() + +I, J, K, dtype = 4, 8, 16, "float32" +a = O.tensor((I, K), dtype, name="A") # A lives on the host +b = O.tensor((K, J), dtype, name="B", device=mppa) # B lives on the accelerator + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C", device=mppa) # C must live on the accelerator + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 2, "psy": 8}) +sch.tile("i", {"i1": 2}) +sch.pack_at("i1", 1) +sched = sch.schedule() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_offload_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# 
CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x8xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x16xf32>, memref<16x8xf32>) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=2, "psy"=8]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 tile_sizes [2, 0, 0] : (!transform.any_op) 
-> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %tiled_linalg_op_6 { +# CHECK-NEXT: transform.apply_patterns.memref.fold_memref_alias_ops +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %4 = transform.sdist.local_buffer_at %tiled_linalg_op_6 tensor 1 : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=2, "psy"=8]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x8xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# 
CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 8] [1, 1] : memref<4x8xf32> to memref<1x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c8 step %c1_3 { +# CHECK-NEXT: %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x8xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [2, 16] [1, 1] : memref<4x16xf32> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, 0] [16, 8] [1, 1] : memref<16x8xf32> to memref<16x8xf32, strided<[8, 1]>> +# CHECK-NEXT: %subview_3 = memref.subview %arg2[%arg3, 0] [2, 8] [1, 1] : memref<4x8xf32> to memref<2x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { +# CHECK-NEXT: %subview_6 = memref.subview %subview[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %arg4] [16, 1] [1, 1] : memref<16x8xf32, strided<[8, 1]>> to memref<16x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [2, 1] [1, 1] : memref<2x8xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_9 
= arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c16 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_6[0, %arg5] [2, 1] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %alloc = memref.alloc() : memref<1x1xf32, 2> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: sdist.read %subview_7[%arg5, %c0_14] to %alloc : memref<16x1xf32, strided<[8, 1], offset: ?>>, memref<1x1xf32, 2> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c2_16 = arith.constant 2 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_15 to %c2_16 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_11[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %alloc[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 2> to memref<1x1xf32, strided<[1, 1]>, 2> +# CHECK-NEXT: %subview_20 = memref.subview %subview_13[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_18, %subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[1, 1]>, 2>) outs(%subview_20 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./i1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# 
CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x16xfloat32 +# CHECK-NEXT: - %1 : 16x8xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x8xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x16xfloat32, 16x8xfloat32] -> [4x8xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/lit.cfg b/tests/filecheck/lit.cfg index 2743d0846..2b276d2c1 100644 --- a/tests/filecheck/lit.cfg +++ b/tests/filecheck/lit.cfg @@ -12,7 +12,7 @@ config.environment["TIMEOUT"] = "10" config.available_features = set() -for module in ["mlir", "tvm", "jir", "mlir_sdist"]: +for module in ["mlir", "tvm", "jir", "mlir_sdist", "mlir_mppa"]: if importlib.util.find_spec(module) is not None: config.available_features.add(f"module_{module}") else: @@ -28,6 +28,7 @@ env_passthrough = [ "PYTHONPATH", "XTC_MLIR_TARGET", "XTC_MLIR_PREFIX", + "KALRAY_TOOLCHAIN_DIR" ] config.environment.update({ var: os.environ[var]