diff --git a/Makefile b/Makefile index 9ec20a8d8..78fe54df8 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ check-lit-c: check-lit-nvgpu: [ `uname -s` = Darwin ] || env XTC_MLIR_TARGET=nvgpu lit -v tests/filecheck/backends tests/filecheck/mlir_loop tests/filecheck/evaluation +check-lit-mppa: + env XTC_MLIR_TARGET=mppa lit -v -j 1 tests/filecheck/backends/target_mppa + check-pytest: scripts/pytest/run_pytest.sh -v diff --git a/sdist_requirements.txt b/sdist_requirements.txt index 0cf8abdf4..b7fd6d145 100644 --- a/sdist_requirements.txt +++ b/sdist_requirements.txt @@ -1,4 +1,4 @@ --index-url https://gitlab.inria.fr/api/v4/groups/corse/-/packages/pypi/simple -mlir-sdist==21.1.2.2026012001 +mlir-sdist==21.1.2.2026021601 mlir==21.1.2.2025091603 xtc-mlir==21.1.2.2 diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py index b4c9bfe34..fa85fd00c 100644 --- a/src/xtc/backends/mlir/MlirCompiler.py +++ b/src/xtc/backends/mlir/MlirCompiler.py @@ -28,13 +28,14 @@ get_default_target, ) from xtc.utils.ext_tools import get_shlib_extension +from xtc.itf.runtime.common import CommonRuntimeInterface class MlirCompiler(itf.comp.Compiler): def __init__( self, backend: "backend.MlirBackend", - target: str | None = None, + target: str | CommonRuntimeInterface | None = None, **kwargs: Any, ): self._backend = backend @@ -44,9 +45,12 @@ def __init__( self._config = MlirConfig(**kwargs) if target is None: self._target = get_default_target()(self._config) - else: + elif isinstance(target, str): self._target = get_target_from_name(target)(self._config) + elif isinstance(target, CommonRuntimeInterface): + self._target = get_target_from_name(target.target_name())(self._config) assert self._target is not None + self._runtime_target = target self._compiler_kwargs = kwargs @property @@ -136,6 +140,7 @@ def mlir_insert_transform_pass(self) -> None: concluding_passes=self._config.concluding_passes, always_vectorize=self._config.always_vectorize, 
vectors_size=self._config.vectors_size, + target=self._target, ) insert_transform_pass.run() if self._config.print_source_ir: @@ -157,6 +162,8 @@ def _save_temp(self, fname: str, content: Any) -> None: outf.write(str(content)) def _register_mlir_extensions(self) -> None: + for extension in self._config.required_extensions: + self._mlir_program.require_extension(extension, weak=False) if self._mlir_schedule is not None: for extension, weak in self._mlir_schedule.mlir_extensions.items(): self._mlir_program.require_extension(extension, weak=weak) diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py index de33ff28d..10d80eb55 100644 --- a/src/xtc/backends/mlir/MlirCompilerPasses.py +++ b/src/xtc/backends/mlir/MlirCompilerPasses.py @@ -38,6 +38,7 @@ from .MlirProgram import RawMlirProgram from .MlirScheduler import MlirSchedule, MlirNodeSchedule +from .MlirTarget import MlirTarget _VECTO_SEQ_NAME = "_vecto" _SUPER_VECTORIZE_SEQ_NAME = "_super_vectorize" @@ -98,12 +99,14 @@ class MlirProgramInsertTransformPass: def __init__( self, mlir_program: RawMlirProgram, + target: MlirTarget, mlir_schedule: MlirSchedule | None = None, concluding_passes: list[str] = [], always_vectorize: bool = True, vectors_size: int | None = None, ) -> None: self._mlir_program = mlir_program + self._target = target self._mlir_schedule = mlir_schedule self._loc = Location.unknown(self._mlir_program.mlir_context) self._concluding_passes = concluding_passes @@ -428,12 +431,15 @@ def _vectorize(self, sched_state: SchedulingState): if self._vectors_size is not None: return - transform.IncludeOp( - results_=[], - target=_VECTO_SEQ_NAME, - failure_propagation_mode=2, - operands_=[sched_state.handle], - ) + if self._target.custom_vectorize(): + self._target.apply_custom_vectorize(sched_state.handle) + else: + transform.IncludeOp( + results_=[], + target=_VECTO_SEQ_NAME, + failure_propagation_mode=2, + operands_=[sched_state.handle], + ) def 
_post_vectorize(self, sched_state: SchedulingState): if self._vectors_size is not None: diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py index 2d0ab5128..0823ebfd9 100644 --- a/src/xtc/backends/mlir/MlirConfig.py +++ b/src/xtc/backends/mlir/MlirConfig.py @@ -30,6 +30,7 @@ class MlirConfig: arch: str = "native" cpu: str = "native" selected_device: int | None = None + required_extensions: list[str] = field(default_factory=list) def __post_init__(self): object.__setattr__( diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py index 191cad027..ca9650e48 100644 --- a/src/xtc/backends/mlir/MlirGraphBackend.py +++ b/src/xtc/backends/mlir/MlirGraphBackend.py @@ -7,7 +7,14 @@ from xdsl.dialects.func import FuncOp as xdslFuncOp from xdsl.dialects import func, memref -from xdsl.dialects.builtin import MemRefType, f32, f64 +from xdsl.dialects.builtin import ( + MemRefType, + f32, + f64, + ArrayAttr, + UnitAttr, + DictionaryAttr, +) from xdsl.ir import Region, Block, Operation from xdsl.builder import ImplicitBuilder @@ -97,6 +104,14 @@ def _init_from_graph( self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type)) for tensor_type in [*inputs_types, *outputs_types] ] + arg_attrs = ArrayAttr( + [ + DictionaryAttr( + self._xdsl_attrs_from_tensortype(cast(XTCTensorType, tensor_type)) + ) + for tensor_type in [*inputs_types, *outputs_types] + ] + ) inlined_block = Block(arg_types=params_types) variables = { name: arg @@ -109,11 +124,11 @@ def _init_from_graph( with ImplicitBuilder(inlined_block): func.ReturnOp() region = Region([inlined_block]) # type: ignore # issue with mypy - payload = xdslFuncOp.from_region( + payload = xdslFuncOp( name=graph.name, - input_types=params_types, - return_types=[], + function_type=(params_types, []), region=region, + arg_attrs=arg_attrs, ) nodes_dict = {} for attrs in block_attrs: @@ -139,6 +154,11 @@ def _xdsl_type_from_tensortype(self, type: XTCTensorType) 
-> Any: elt_type, shape = self._xdsl_elt_shape_from_tensortype(type) return MemRefType(elt_type, shape) + def _xdsl_attrs_from_tensortype(self, type: XTCTensorType): + if type.device is not None: + return {"memref.on_device": UnitAttr()} + return {} + def _np_types_spec( self, types: list[MemRefType] ) -> list[dict[str, tuple[int, ...] | str]]: diff --git a/src/xtc/backends/mlir/MlirProgram.py b/src/xtc/backends/mlir/MlirProgram.py index ec9e68b1b..a9a77c179 100644 --- a/src/xtc/backends/mlir/MlirProgram.py +++ b/src/xtc/backends/mlir/MlirProgram.py @@ -73,23 +73,26 @@ def parse_and_add_function( function, context=self.mlir_context ) - # Insert (or not) the noalias attributes - arg_attrs = [] - if no_alias: - for _ in payload_func.arguments: - dict_attr = DictAttr.get( - { - "llvm.noalias": UnitAttr.get(context=self.mlir_context), - }, - context=self.mlir_context, + with self.mlir_context: + # Insert (or not) the noalias attributes + new_arg_attrs = [] + if no_alias: + for arg_attrs in payload_func.arg_attrs: + new_dict = {} + for i in range(len(arg_attrs)): + new_dict[arg_attrs[i].name] = arg_attrs[i].attr + new_dict["llvm.noalias"] = UnitAttr.get(context=self.mlir_context) + new_arg_attrs.append( + DictAttr.get(new_dict, context=self.mlir_context) + ) + payload_func.arg_attrs = ArrayAttr.get( + new_arg_attrs, context=self.mlir_context ) - arg_attrs.append(dict_attr) - payload_func.arg_attrs = ArrayAttr.get(arg_attrs, context=self.mlir_context) - # Insert the function in the MLIR program - ip = InsertionPoint.at_block_begin(self.mlir_module.body) - ip.insert(payload_func) - name = str(payload_func.name).replace('"', "") - self.local_functions[str(name)] = payload_func + # Insert the function in the MLIR program + ip = InsertionPoint.at_block_begin(self.mlir_module.body) + ip.insert(payload_func) + name = str(payload_func.name).replace('"', "") + self.local_functions[str(name)] = payload_func return payload_func diff --git 
a/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py index 28d7f3dcc..b14174c88 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirCTarget.py @@ -36,6 +36,7 @@ from ..MlirProgram import RawMlirProgram from mlir.passmanager import PassManager +from mlir.ir import OpResult __all__ = ["MlirCTarget"] @@ -181,6 +182,14 @@ def create_module( ) -> itf.comp.Module: return HostModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + def dump_ir(self, mlir_program: RawMlirProgram, title: str): print(f"// -----// {title} //----- //", file=sys.stderr) print(str(mlir_program.mlir_module), file=sys.stderr) diff --git a/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py index 60be1fef0..22a660bd0 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirLLVMTarget.py @@ -37,6 +37,7 @@ from ..MlirProgram import RawMlirProgram from mlir.passmanager import PassManager +from mlir.ir import OpResult __all__ = ["MlirLLVMTarget"] @@ -176,6 +177,14 @@ def create_module( ) -> itf.comp.Module: return HostModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + def dump_ir(self, mlir_program: RawMlirProgram, title: str): print(f"// -----// {title} //----- //", file=sys.stderr) print(str(mlir_program.mlir_module), file=sys.stderr) diff --git a/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py new file mode 100644 index 000000000..43cc5ed81 --- /dev/null +++ b/src/xtc/backends/mlir/MlirTarget/MlirMppaTarget.py @@ -0,0 +1,461 
@@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing_extensions import override +from typing import Any +import subprocess +import os +import sys +import tempfile +from pathlib import Path + +from xtc.utils.ext_tools import ( + get_shlib_extension, + runtime_libs, + system_libs, + cc_bin, +) + +from xtc.runtimes.accelerator.mppa import MppaConfig + +from xtc.targets.accelerator.mppa import MppaModule +import xtc.itf as itf +from xtc.itf.graph import Graph + +from .MlirTarget import MlirTarget +from ..MlirConfig import MlirConfig +from ..MlirProgram import RawMlirProgram + +from mlir.passmanager import PassManager +from mlir.ir import OpResult +from mlir.dialects import transform + +__all__ = ["MlirMppaTarget"] + + +class MlirMppaTarget(MlirTarget): + """Kalray MPPA Target + + This target implements the lowering and code generation to C + for the Kalray MPPA architecture, using the Mlir-Mppa backend. + """ + + def __init__(self, config: MlirConfig): + super().__init__(config) + # config.required_extensions.append("sdist") + self._mlir_mppa_backend = MlirMppaBackend(config) + + @override + def name(self) -> str: + return "mppa" + + @override + def arch(self) -> str: + return "kv3-2" + + @override + def generate_code_for_target( + self, + mlir_program: RawMlirProgram, # Will be modified in place + **kwargs: Any, + ) -> None: + save_temp = self._save_temp + save_temps_dir = self._config.save_temps_dir + temp_dir = None + dump_file = kwargs.get("dump_file", None) + if dump_file is None: + temp_dir = tempfile.mkdtemp() + dump_file = f"{temp_dir}/payload" + if self._config.save_temps: + assert dump_file is not None, "TODO: save_temp requires dump_file" + dump_tmp_dir = Path(save_temps_dir) + os.makedirs(save_temps_dir, exist_ok=True) + else: + dump_tmp_dir = Path(dump_file).parent + dump_base = Path(dump_file).name + + dump_tmp_file = f"{dump_tmp_dir}/{dump_base}" + mlir_atrn_dump_file = 
f"{dump_base}.after_trn.mlir" + mlir_bmppa_dump_file = f"{dump_base}.before_mppa.mlir" + mlir_amppa_dump_file = f"{dump_base}.after_mppa.mlir" + c_host_dump_file = f"{dump_base}.host.c" + c_accelerator_dump_file = f"{dump_base}.accelerator.c" + obj_host_dump_file = f"{dump_base}.host.o" + obj_accelerator_dump_file = f"{dump_base}.accelerator.o" + so_dump_file = f"{dump_file}.{get_shlib_extension()}" + kvx_so_dump_file = f"{dump_file}.kvx.so" + + # Lower to MLIR with MPPA dialect + save_temp(mlir_atrn_dump_file, mlir_program.mlir_module) + self._mlir_to_mppa_pass(mlir_program) + + # Run MLIR MPPA backend + with open(mlir_bmppa_dump_file, "w") as outf: + outf.write(str(mlir_program.mlir_module)) + self._mlir_mppa_backend.run_lowering( + mlir_before_mppa_dump_file=mlir_bmppa_dump_file, + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + ) + if self._config.print_lowered_ir: + print(f"// -----// IR Dump After MPPA Opt //----- //", file=sys.stderr) + with open(mlir_amppa_dump_file, "r") as inf: + print(inf.read(), file=sys.stderr) + + # Generate C code for host and accelerator + self._mlir_mppa_backend.generate_c_host( + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + c_host_dump_file=c_host_dump_file, + ) + self._mlir_mppa_backend.generate_c_accelerator( + mlir_after_mppa_dump_file=mlir_amppa_dump_file, + c_accelerator_dump_file=c_accelerator_dump_file, + ) + + # Compile C code for accelerator + self._mlir_mppa_backend.compile_c_accelerator( + c_accelerator_dump_file=c_accelerator_dump_file, + obj_accelerator_dump_file=obj_accelerator_dump_file, + ) + # Link KVX library + self._mlir_mppa_backend.link_kvx_library( + obj_accelerator_dump_file=obj_accelerator_dump_file, + kvx_so_dump_file=kvx_so_dump_file, + ) + + # Compile C code for host + self._mlir_mppa_backend.compile_c_host( + c_host_dump_file=c_host_dump_file, + obj_host_dump_file=obj_host_dump_file, + kvx_so_dump_file=kvx_so_dump_file, + ) + + # Link final shared library + 
self._mlir_mppa_backend.link_shared_library( + obj_host_dump_file=obj_host_dump_file, + obj_accelerator_dump_file=obj_accelerator_dump_file, + so_dump_file=so_dump_file, + ) + + # Remove intermediate files if needed + if not self._config.save_temps: + os.remove(mlir_bmppa_dump_file) + os.remove(mlir_amppa_dump_file) + os.remove(c_host_dump_file) + os.remove(c_accelerator_dump_file) + os.remove(obj_host_dump_file) + os.remove(obj_accelerator_dump_file) + + @override + def create_module( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + graph: Graph | None = None, + **kwargs: Any, + ) -> itf.comp.Module: + mppa_config = MppaConfig(self._config) + return MppaModule( + name, payload_name, file_name, file_type, mppa_config, graph, **kwargs + ) + + @override + def custom_vectorize(self) -> bool: + return True + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + transform.AnnotateOp(handle, "xtc.request_vectorization") + + def dump_ir(self, mlir_program: RawMlirProgram, title: str): + print(f"// -----// {title} //----- //", file=sys.stderr) + print(str(mlir_program.mlir_module), file=sys.stderr) + + def _mlir_to_mppa_pass(self, mlir_program: RawMlirProgram): + to_mppa_pass = MlirProgramToMlirMppaPass( + mlir_program=mlir_program, + ) + to_mppa_pass.run() + if self._config.print_lowered_ir: + self.dump_ir(mlir_program, "IR Dump After MLIR Opt") + + @property + def shared_libs(self): + return system_libs + [ + f"{self._config.mlir_install_dir}/lib/{lib}" for lib in runtime_libs + ] + + @property + def shared_path(self): + return [f"-Wl,-rpath,{self._config.mlir_install_dir}/lib/"] + + def _save_temp(self, fname: str, content: Any) -> None: + if not self._config.save_temps: + return + os.makedirs(self._config.save_temps_dir, exist_ok=True) + with open(f"{self._config.save_temps_dir}/{fname}", "w") as outf: + outf.write(str(content)) + + +class MlirProgramToMlirMppaPass: + def __init__( + self, + mlir_program: 
RawMlirProgram, + ) -> None: + self._mlir_program = mlir_program + + def _lowering_pipeline(self) -> list[str]: + pipeline = [ + "cse", + "sccp", + ] + if "sdist" in self._mlir_program.mlir_extensions: + pipeline += [ + "sdist-lower-distribution", + "cse", + "convert-sdist-to-mppa", + "cse", + "convert-sdist-utils-to-mppa", + "cse", + "canonicalize", + "cse", + ] + return pipeline + + def run(self) -> None: + self._mlir_program.mlir_context.allow_unregistered_dialects = True + pm = PassManager(context=self._mlir_program.mlir_context) + pm.enable_verifier(False) + for opt in self._lowering_pipeline(): + pm.add(opt) # type: ignore # no attribte add? + pm.run(self._mlir_program.mlir_module.operation) + self._mlir_program.mlir_context.allow_unregistered_dialects = False + + +class MlirMppaBackend: + def __init__(self, config: MlirConfig): + self._config = config + try: + import mlir_mppa + except ImportError: + raise ImportError( + "mlir_mppa is not installed but is required for MPPA target" + ) + try: + self._csw_path = os.environ["KALRAY_TOOLCHAIN_DIR"] + except KeyError: + raise KeyError( + "Please source the Kalray Accesscore Toolchain: https://www.kalrayinc.com/products/software/" + ) + self._mlir_mppa_path = mlir_mppa.__path__[0] + + @property + def cmd_mppa_opt(self): + return [f"{self._mlir_mppa_path}/bin/mppa-opt"] + + @property + def cmd_mppa_translate(self): + return [f"{self._mlir_mppa_path}/bin/mppa-translate"] + + @property + def cmd_kvx_cc(self): + return [f"{self._csw_path}/bin/kvx-cos-gcc"] + + @property + def cmd_host_cc(self): + return [cc_bin] + + def _execute_command( + self, + cmd: list[str], + input_pipe: str | None = None, + pipe_stdoutput: bool = True, + ) -> subprocess.CompletedProcess: + pretty_cmd = "| " if input_pipe else "" + pretty_cmd += " ".join(cmd) + if self._config.debug: + print(f"> exec: {pretty_cmd}", file=sys.stderr) + + if input_pipe and pipe_stdoutput: + result = subprocess.run( + cmd, input=input_pipe, stdout=subprocess.PIPE, 
text=True + ) + elif input_pipe and not pipe_stdoutput: + result = subprocess.run(cmd, input=input_pipe, text=True) + elif not input_pipe and pipe_stdoutput: + result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, text=True) + return result + + def _lowering_pipeline(self) -> str: + passes = [] + # TODO run these only if sdist is not present + # passes.append("func.func(mppa-launch{device=k300})") + # passes.append("func.func(kvxcluster-scf-forall-distribute{num-clusters=1})") + # passes.append("func.func(kvxcluster-launch)") + passes.append("canonicalize") + passes.append("func.func(mppa-load-weights)") + passes.append("func.func(mppa-copy-buffers)") + passes.append("canonicalize") + passes.append("func.func(kalray-lift-strided-memref-copy-to-linalg)") + passes.append("canonicalize") + passes.append("func.func(kvxcluster-lower-promoted-memory)") + passes.append( + "func.func(kvxcluster-optimize-dma-transfers{bundle=true pipeline=false})" + ) + passes.append("canonicalize") + passes.append("func.func(kvxcluster-basic-static-allocation)") + passes.append("canonicalize") + passes.append("func.func(kalray-remove-useless-initializations)") + passes.append("canonicalize") + passes.append("func.func(kvxpe-scf-forall-distribute{num-pes=1})") + passes.append("func.func(kvxpe-launch)") + passes.append( + "func.func(kvxuks-catch{request-attribute=xtc.request_vectorization})" + ) + passes.append("canonicalize") + passes.append("convert-linalg-to-loops") + passes.append("func.func(lower-affine)") + passes.append("func.func(expand-strided-metadata)") + passes.append("func.func(kvx-non-canonical-vectorize)") + passes.append("func.func(kvx-vectorize)") + passes.append("func.func(scf-forall-to-for)") + passes.append("convert-math-to-kvxisa") + passes.append("convert-math-to-libm") + passes.append("func.func(lower-affine)") + passes.append("cse") + # TODO Enable Mppa traces + ##if config.mppa_trace_enable: + ## 
passes.append("func.func(kalray-request-benchmarks{target-op=kvxcluster.launch})") + ## passes.append("kalray-apply-instrumentation{use-traces=" + str(config.mppa_trace_enable) + "}") + passes.append("func.func(kvxcluster-outline-kernels{specialize=true})") + passes.append("func.func(canonicalize)") + + new_passes = [] + for p in passes: + new_passes.append(p) + new_passes.append("cse") + # new_passes.append("canonicalize") # FIXME bug with kvxcluster.launch + + # No cse or canonicalize must run after + new_passes.append( + "func.func(kalray-clone-crossing-constants)" + ) # TODO remove remaining useless + passes = new_passes + + return "builtin.module(" + ",".join(passes) + ")" + + def run_lowering( + self, mlir_before_mppa_dump_file: str, mlir_after_mppa_dump_file: str + ) -> None: + cmd = self.cmd_mppa_opt + [ + "-pass-pipeline=" + self._lowering_pipeline(), + mlir_before_mppa_dump_file, + "-o", + mlir_after_mppa_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def generate_c_host( + self, mlir_after_mppa_dump_file: str, c_host_dump_file: str + ) -> None: + cmd = self.cmd_mppa_translate + [ + "--mlir-to-c-host", + mlir_after_mppa_dump_file, + "-o", + c_host_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def generate_c_accelerator( + self, mlir_after_mppa_dump_file: str, c_accelerator_dump_file: str + ) -> None: + cmd = self.cmd_mppa_translate + [ + "--mlir-to-c-accelerator", + mlir_after_mppa_dump_file, + "-o", + c_accelerator_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def compile_c_accelerator( + self, c_accelerator_dump_file: str, obj_accelerator_dump_file: str + ) -> None: + cmd = self.cmd_kvx_cc + [ + "-O2", + "-fPIC", + f"-I{self._mlir_mppa_path}/include", + "-march=kv3-2", + "-DBUILD_ID=0", + "-fvect-cost-model=cheap", + "-fstack-limit-register=sr", + "-c", + c_accelerator_dump_file, + 
"-o", + obj_accelerator_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def link_kvx_library( + self, obj_accelerator_dump_file: str, kvx_so_dump_file: str + ) -> None: + cmd = self.cmd_kvx_cc + [ + "-shared", + "-fPIC", + "-march=kv3-2", + "-Wl,-soname=libkvx.so", + obj_accelerator_dump_file, + "-o", + kvx_so_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def compile_c_host( + self, c_host_dump_file: str, obj_host_dump_file: str, kvx_so_dump_file: str + ) -> None: + cmd = self.cmd_host_cc + [ + "-O2", + "-fPIC", + "-Wall", + "-Wextra", + "-I" + self._mlir_mppa_path + "/include", + "-I" + self._csw_path + "/include", + "-DTARGET_KV3_2", + '-DKERNEL_PATHNAME="' + kvx_so_dump_file + '"', + "-c", + c_host_dump_file, + "-o", + obj_host_dump_file, + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 + + def link_shared_library( + self, obj_host_dump_file: str, obj_accelerator_dump_file: str, so_dump_file: str + ) -> None: + cmd = self.cmd_host_cc + [ + "-shared", + "-fPIC", + "-O2", + obj_host_dump_file, + "-o", + so_dump_file, + "-Wl,-rpath,$ORIGIN/../lib", + "-L" + self._csw_path + "/lib", + "-lmppa_offload_host", + "-lmopd", + "-lmppa_rproc_host", + "-lpthread", + "-L" + self._mlir_mppa_path + "/_mlir_libs", + "-lmlir_c_runner_utils", + ] + exe_process = self._execute_command(cmd=cmd) + assert exe_process.returncode == 0 diff --git a/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py index b5cbef330..ba4ba5a2e 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirNVGPUTarget.py @@ -23,12 +23,12 @@ cc_bin, ) from xtc.utils.tools import get_cuda_prefix -from xtc.targets.gpu import GPUModule +from xtc.targets.accelerator.gpu import GPUModule import xtc.itf as itf from xtc.itf.graph import Graph from mlir.dialects import 
func -from mlir.ir import UnitAttr +from mlir.ir import UnitAttr, OpResult from mlir.passmanager import PassManager from .MlirTarget import MlirTarget @@ -323,6 +323,14 @@ def create_module( ) -> itf.comp.Module: return GPUModule(name, payload_name, file_name, file_type, graph, **kwargs) + @override + def custom_vectorize(self) -> bool: + return False + + @override + def apply_custom_vectorize(self, handle: OpResult) -> None: + return + @property def disassemble_option(self): if not self._config.to_disassemble: diff --git a/src/xtc/backends/mlir/MlirTarget/MlirTarget.py b/src/xtc/backends/mlir/MlirTarget/MlirTarget.py index 886c49a1f..02a0f5eba 100644 --- a/src/xtc/backends/mlir/MlirTarget/MlirTarget.py +++ b/src/xtc/backends/mlir/MlirTarget/MlirTarget.py @@ -11,6 +11,8 @@ import xtc.itf as itf from xtc.itf.graph import Graph +from mlir.ir import OpResult + __all__ = ["MlirTarget"] @@ -65,3 +67,17 @@ def create_module( for the target. """ ... + + @abstractmethod + def custom_vectorize(self) -> bool: + """ + Return True if the target needs to apply custom vectorization. + """ + ... + + @abstractmethod + def apply_custom_vectorize(self, handle: OpResult) -> None: + """ + Apply the custom vectorization for the target. + """ + ... 
diff --git a/src/xtc/backends/mlir/MlirTarget/__init__.py b/src/xtc/backends/mlir/MlirTarget/__init__.py index f61f178dd..0ccd44d7b 100644 --- a/src/xtc/backends/mlir/MlirTarget/__init__.py +++ b/src/xtc/backends/mlir/MlirTarget/__init__.py @@ -20,6 +20,10 @@ def get_target_from_name(name: str) -> type[MlirTarget]: from .MlirNVGPUTarget import MlirNVGPUTarget return MlirNVGPUTarget + elif name == "mppa": + from .MlirMppaTarget import MlirMppaTarget + + return MlirMppaTarget else: raise NameError(f"'{name}' is not a known target") diff --git a/src/xtc/csrcs/runtimes/gpu/perf_event_gpu.cpp b/src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.cpp similarity index 100% rename from src/xtc/csrcs/runtimes/gpu/perf_event_gpu.cpp rename to src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.cpp diff --git a/src/xtc/csrcs/runtimes/gpu/perf_event_gpu.h b/src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.h similarity index 100% rename from src/xtc/csrcs/runtimes/gpu/perf_event_gpu.h rename to src/xtc/csrcs/runtimes/accelerator/gpu/perf_event_gpu.h diff --git a/src/xtc/csrcs/runtimes/accelerator/mppa/host.c b/src/xtc/csrcs/runtimes/accelerator/mppa/host.c new file mode 100644 index 000000000..1962bf81c --- /dev/null +++ b/src/xtc/csrcs/runtimes/accelerator/mppa/host.c @@ -0,0 +1,130 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2024-2026 The XTC Project Authors + */ +#include "host_structures.h" +#include "mlir_host_header.h" +#include "mppa_management_host.h" + +#include + +#include +#include +#include +#include + +void *mppa_common_structures = NULL; +mppa_offload_accelerator_t *mppa_accelerator = NULL; +mppa_offload_sysqueue_t *master_sysqueue = NULL; +size_t mppa_alloc_alignment = 64; + +bool mppa_init_device() +{ + if (mppa_common_structures == NULL) { + mppa_common_structures = mppa_pre_init(); + // Get ctx + mppa_offload_ctx_ptr = (mppa_offload_host_context_t*) ((void**)mppa_common_structures)[0]; + assert(mppa_offload_ctx_ptr != NULL); + 
// Get accelerator + mppa_accelerator = mppa_offload_get_accelerator(mppa_offload_ctx_ptr, 0); + assert(mppa_accelerator != NULL); + // Get main sysqueue + master_sysqueue = mppa_offload_get_sysqueue(mppa_accelerator, 0); + assert(master_sysqueue != NULL); + } + return true; +} + +bool mppa_deinit_device() +{ + mppa_de_init(); + mppa_common_structures = NULL; + return true; +} + +void* mppa_get_common_structures() +{ + return mppa_common_structures; +} + +void* mppa_create_memory_handle() +{ + void *handle = malloc(sizeof(mppa_buffer_t)); + assert(handle != NULL); + return handle; +} + +bool mppa_destroy_memory_handle(void *handle) +{ + free(handle); + return true; +} + +void mppa_set_alloc_alignment(size_t alignment) +{ + mppa_alloc_alignment = alignment; +} + +bool mppa_memory_allocate(void *handle, size_t size) +{ + assert(handle != NULL); + assert(master_sysqueue != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_alloc(master_sysqueue, size, mppa_alloc_alignment, MPPA_OFFLOAD_ALLOC_DDR, &(buffer->voffset), &(buffer->offset)) != 0) { + assert(0 && "Fail to alloc buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_free(void *handle) +{ + assert(handle != NULL); + assert(master_sysqueue != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_free(master_sysqueue, MPPA_OFFLOAD_ALLOC_DDR, buffer->voffset) != 0) { + assert(0 && "Fail to dealloc buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_copy_to(void *handle, void *src, size_t size) +{ + assert(handle != NULL); + assert(mppa_accelerator != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + if (mppa_offload_write(mppa_accelerator, src, buffer->offset, size, NULL) != 0) { + assert(0 && "Failed write buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_copy_from(void *handle, void *dst, size_t size) +{ + assert(handle != NULL); + assert(mppa_accelerator != NULL); + mppa_buffer_t *buffer = 
(mppa_buffer_t *)handle; + if (mppa_offload_read(mppa_accelerator, dst, buffer->offset, size, NULL) != 0) { + assert(0 && "Failed read buffer\n"); + return false; + } + return true; +} + +bool mppa_memory_fill_zero(void *handle, size_t size) +{ + void* tmp = calloc(size, 1); + assert(tmp != NULL); + bool res = mppa_memory_copy_to(handle, tmp, size); + free(tmp); + return res; +} + +void* mppa_memory_data_pointer(void *handle) +{ + assert(handle != NULL); + mppa_buffer_t *buffer = (mppa_buffer_t *)handle; + return (void*)buffer->voffset; +} diff --git a/src/xtc/graphs/xtc/data.py b/src/xtc/graphs/xtc/data.py index 2451de8a9..f26c35168 100644 --- a/src/xtc/graphs/xtc/data.py +++ b/src/xtc/graphs/xtc/data.py @@ -19,6 +19,8 @@ ConstantDataType, ) +from xtc.itf.runtime.accelerator import AcceleratorDevice + __all__ = [ "XTCTensorType", @@ -27,9 +29,15 @@ class XTCTensorType(TensorType): - def __init__(self, shape: ShapeType = None, dtype: DataType = None): + def __init__( + self, + shape: ShapeType = None, + dtype: DataType = None, + device: AcceleratorDevice | None = None, + ): self._shape = shape self._dtype = dtype + self._device = device @property @override @@ -41,6 +49,11 @@ def shape(self) -> ShapeType: def dtype(self) -> DataType: return self._dtype + @property + @override + def device(self) -> AcceleratorDevice | None: + return self._device + @property @override def ndim(self) -> int: diff --git a/src/xtc/graphs/xtc/expr.py b/src/xtc/graphs/xtc/expr.py index e4ce0accd..c76a861ef 100644 --- a/src/xtc/graphs/xtc/expr.py +++ b/src/xtc/graphs/xtc/expr.py @@ -24,6 +24,8 @@ XTCOperTranspose, ) +from xtc.itf.runtime.accelerator import AcceleratorDevice + __all__ = [ "XTCExpr", "XTCValueExpr", @@ -116,12 +118,13 @@ def __init__( tensor: XTCTensorType | XTCTensor | ShapeType | None = None, shape: ShapeType | DataType = None, dtype: DataType = None, + device: AcceleratorDevice | None = None, ) -> None: super().__init__() if tensor is None: assert shape is None or 
isinstance(shape, tuple) assert dtype is None or isinstance(dtype, str) - type = XTCTensorType(shape=shape, dtype=dtype) + type = XTCTensorType(shape=shape, dtype=dtype, device=device) value = XTCTensor(type=type) elif isinstance(tensor, XTCTensorType): assert shape is None and dtype is None @@ -133,11 +136,12 @@ def __init__( if shape is not None: assert isinstance(shape, str) assert dtype is None - type = XTCTensorType(shape=tensor, dtype=shape) + type = XTCTensorType(shape=tensor, dtype=shape, device=device) else: - type = XTCTensorType(shape=tensor, dtype=dtype) + type = XTCTensorType(shape=tensor, dtype=dtype, device=device) value = XTCTensor(type=type) self._value = value + self._device = device self._op = XTCOperTensor() @property diff --git a/src/xtc/graphs/xtc/operators.py b/src/xtc/graphs/xtc/operators.py index 1f65bd17e..a8addc0eb 100644 --- a/src/xtc/graphs/xtc/operators.py +++ b/src/xtc/graphs/xtc/operators.py @@ -12,6 +12,7 @@ from xtc.itf.operator import Operator from xtc.itf.data import Tensor, TensorType +from xtc.itf.runtime.accelerator import AcceleratorDevice from .data import XTCTensor, XTCTensorType from .operation import XTCOperation @@ -30,8 +31,14 @@ class XTCOperator(Operator): - def __init__(self, name: str, **attrs: XTCOperatorAttr) -> None: + def __init__( + self, + name: str, + device: AcceleratorDevice | None = None, + **attrs: XTCOperatorAttr, + ) -> None: self._name = name + self._device = device self._attrs = NS(**attrs) @property @@ -43,11 +50,27 @@ def name(self) -> str: def attrs(self) -> XTCOperatorAttrs: return self._attrs + @property + def device(self) -> AcceleratorDevice | None: + return self._device + @override def forward_types( self, inputs_types: Sequence[TensorType] ) -> Sequence[XTCTensorType]: - return [cast(XTCTensorType, inp_type) for inp_type in inputs_types] + if self.device is None: + return [cast(XTCTensorType, inp_type) for inp_type in inputs_types] + res_types = [] + for inp_type in inputs_types: + 
inp_tensor_type = cast(XTCTensorType, inp_type) + res_types.append( + XTCTensorType( + shape=inp_tensor_type.shape, + dtype=inp_tensor_type.dtype, + device=inp_tensor_type.device, + ) + ) + return res_types @override def forward(self, inputs: Sequence[Tensor]) -> Sequence[XTCTensor]: @@ -88,8 +111,8 @@ def _get_operation( class XTCOperTensor(XTCOperator): - def __init__(self) -> None: - super().__init__("tensor") + def __init__(self, **attrs: XTCOperatorAttr) -> None: + super().__init__("tensor", **attrs) @override def get_operation( @@ -112,8 +135,8 @@ def get_operation( class XTCOperMatmul(XTCOperator): - def __init__(self) -> None: - super().__init__("matmul") + def __init__(self, **attrs: XTCOperatorAttr) -> None: + super().__init__("matmul", **attrs) @override def get_operation( @@ -155,8 +178,7 @@ def forward_types( ) return [ XTCTensorType( - shape=(i, j), - dtype=inputs_types[0].dtype, + shape=(i, j), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -268,6 +290,7 @@ def forward_types( XTCTensorType( shape=tuple([*inputs_types[0].shape[:-3], oh, ow, f]), dtype=inputs_types[0].dtype, + device=self.device, ), ] @@ -299,8 +322,10 @@ class _OperPadImpl: def __init__(self, **attrs: XTCOperatorAttr) -> None: padding = attrs.get("padding", 0) constant_value = attrs.get("constant_value", 0) + device = attrs.get("device", None) self.padding = padding self.constant_value = constant_value + self.device = device def get_operation_variable( self, @@ -352,8 +377,7 @@ def forward_types( dims_types = [value + pad for value in dims_types] return [ XTCTensorType( - shape=tuple(dims_types), - dtype=inputs_types[0].dtype, + shape=tuple(dims_types), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -410,8 +434,15 @@ def __init__(self, **attrs: XTCOperatorAttr) -> None: ) if isinstance(padding, dict): padding = {k: v for k, v in padding.items() if v != (0, 0)} - self.impl = _OperPadImpl(padding=padding, constant_value=constant_value) - super().__init__("pad", 
padding=padding, constant_value=constant_value) + super().__init__( + "pad", + padding=padding, + constant_value=constant_value, + device=attrs.get("device", None), + ) + self.impl = _OperPadImpl( + padding=padding, constant_value=constant_value, device=self.device + ) @override def get_operation( @@ -486,8 +517,15 @@ def __init__(self, **attrs: XTCOperatorAttr) -> None: assert isinstance(constant_value, (int, float)), ( f"constant_value need to be a number" ) - self.impl = _OperPadImpl(padding=padding, constant_value=constant_value) - super().__init__("pad2d", padding=padding, constant_value=constant_value) + super().__init__( + "pad2d", + padding=padding, + constant_value=constant_value, + device=attrs.get("device", None), + ) + self.impl = _OperPadImpl( + padding=padding, constant_value=constant_value, device=self.device + ) @override def get_operation( @@ -602,8 +640,7 @@ def forward_types( dims_types = [value - pad for value in dims_types] return [ XTCTensorType( - shape=tuple(dims_types), - dtype=inputs_types[0].dtype, + shape=tuple(dims_types), dtype=inputs_types[0].dtype, device=self.device ), ] @@ -668,8 +705,7 @@ def forward_types( out_shape = tuple([x if x != -1 else size // fixed_size for x in self._shape]) return [ XTCTensorType( - shape=out_shape, - dtype=inputs_types[0].dtype, + shape=out_shape, dtype=inputs_types[0].dtype, device=self.device ), ] @@ -701,8 +737,7 @@ def forward_types( out_shape = tuple([shape[n] for n in self.attrs.axes]) return [ XTCTensorType( - shape=out_shape, - dtype=inputs_types[0].dtype, + shape=out_shape, dtype=inputs_types[0].dtype, device=self.device ), ] diff --git a/src/xtc/itf/data/tensor.py b/src/xtc/itf/data/tensor.py index 9695a0fd8..823864142 100644 --- a/src/xtc/itf/data/tensor.py +++ b/src/xtc/itf/data/tensor.py @@ -7,6 +7,7 @@ from typing_extensions import override import numpy.typing +from xtc.itf.runtime.accelerator import AcceleratorDevice ShapeType: TypeAlias = tuple[int | str | None, ...] 
| None DataType: TypeAlias = str | None @@ -43,6 +44,16 @@ def dtype(self) -> DataType: """ ... + @property + @abstractmethod + def device(self) -> AcceleratorDevice | None: + """Returns the device of the tensor. + + Returns: + The device of the tensor + """ + ... + @property @abstractmethod def ndim(self) -> int: diff --git a/src/xtc/itf/runtime/accelerator.py b/src/xtc/itf/runtime/accelerator.py new file mode 100644 index 000000000..16fddf0d6 --- /dev/null +++ b/src/xtc/itf/runtime/accelerator.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod +from typing import Any, Callable +import ctypes + +from xtc.itf.comp.module import Module +from xtc.itf.runtime.common import CommonRuntimeInterface + + +class AcceleratorDevice(CommonRuntimeInterface, ABC): + """Abstract interface for an accelerator device (such as GPU, MPPA, etc).""" + + @abstractmethod + def detect_accelerator(self) -> bool: + """Detect if the accelerator device is available. + + Returns: + A boolean representing if the accelerator device is available. + """ + ... + + @abstractmethod + def init_device(self) -> None: + """Initialize the accelerator device. + + This method is called to initialize the accelerator device. + """ + ... + + @abstractmethod + def deinit_device(self) -> None: + """Deinitialize the accelerator device. + + This method is called to deinitialize the accelerator device. + """ + ... + + @abstractmethod + def load_module(self, module: Module) -> None: + """Load a module on the accelerator device. + + Args: + module (AcceleratorModule): The module to load. + """ + ... + + @abstractmethod + def get_module_function(self, module: Module, function_name: str) -> Callable: + """Get a function from a module on the accelerator device. + + Args: + module (AcceleratorModule): The module to get the function from. + function_name (str): The name of the function to get. + """ + ... 
+ + @abstractmethod + def unload_module(self, module: Module) -> None: + """Unload a module from the accelerator device. + + Args: + module (AcceleratorModule): The module to unload. + """ + ... + + @abstractmethod + def memory_allocate(self, size_bytes: int) -> Any: + """Allocate memory on the accelerator device. + + Args: + size_bytes (int): The size in bytes to allocate. + + Returns: + A handle or reference to the allocated memory. + """ + ... + + @abstractmethod + def memory_free(self, handle: Any) -> None: + """Free memory on the accelerator device. + + Args: + handle (Any): The handle to the memory to free. + """ + ... + + @abstractmethod + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + """Copy memory from the host to the accelerator device. + + Args: + acc_handle (Any): The handle to the memory to copy to. + src (ctypes.c_void_p): The source data pointer. + size_bytes (int): The size in bytes to copy. + """ + ... + + @abstractmethod + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + """Copy memory from the accelerator device to the host. + + Args: + acc_handle (Any): The handle to the memory to copy from. + dst (ctypes.c_void_p): The destination data pointer. + size_bytes (int): The size in bytes to copy. + """ + ... + + @abstractmethod + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + """Fill memory on the accelerator device with zeros. + + Args: + acc_handle (Any): The handle to the memory to fill with zeros. + size_bytes (int): The size in bytes to fill with zeros. + """ + ... + + @abstractmethod + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + """Get the data pointer of the memory on the accelerator device. + + Args: + acc_handle (Any): The handle to the memory to get the data pointer of. + """ + ... 
+ + # TODO: describe hardware architecture + # TODO: profiling and traces diff --git a/src/xtc/itf/runtime/common.py b/src/xtc/itf/runtime/common.py new file mode 100644 index 000000000..dd8a60d6e --- /dev/null +++ b/src/xtc/itf/runtime/common.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod +from typing import Any + +from xtc.runtimes.types.dlpack import DLDataType, DLDevice +from xtc.utils.cfunc import CFunc + + +class CommonRuntimeInterface(ABC): + """Abstract interface for a common runtime interface.""" + + @abstractmethod + def target_name(self) -> str: + """Get the name of the target. + + Returns: + A string representing the name of the target. + """ + ... + + @abstractmethod + def device_name(self) -> str: + """Get the name of the device. + + Returns: + A string representing the name of the device. + """ + ... + + @abstractmethod + def device_arch(self) -> str: + """Get the architecture of the device. + + Returns: + A string representing the architecture of the device. + """ + ... + + @abstractmethod + def device_id(self) -> int: + """Get the ID of the device. + + Returns: + An integer representing the ID of the device. + """ + ... + + @abstractmethod + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + """Evaluate a function with timing measurements. + + Args: + results: Pointer to array of doubles to store timing results. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + nargs: Number of arguments passed to the function. + cfunc: Function pointer to evaluate. + args: Pointer to array of void pointers containing function arguments. + """ + ... 
+ + @abstractmethod + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + """Evaluate a function with performance counter measurements. + + Args: + pmu_events: List of performance events to measure. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Function pointer to evaluate. + args_tuples: List of argument tuples. + """ + ... + + @abstractmethod + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + """Evaluate a packed function with timing measurements. + + Args: + results: Pointer to array of doubles to store timing results. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Packed function pointer to evaluate. + args: Pointer to array of packed arguments. + codes: Pointer to array of integers containing argument type codes. + nargs: Number of arguments. + """ + ... + + @abstractmethod + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + """Evaluate a packed function with performance counter measurements. + + Args: + results: Pointer to array of doubles to store performance results. + pmu_events: List of performance events to measure. + repeat: Number of times to repeat the measurement. + number: Number of function calls per repeat. + min_repeat_ms: Minimum time in milliseconds for each repeat. + cfunc: Packed function pointer to evaluate. + args: Pointer to array of packed arguments. 
+ codes: Pointer to array of integers containing argument type codes. + nargs: Number of arguments. + """ + ... + + @abstractmethod + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + """Create a new CNDArray. + + Args: + ndim: Number of dimensions. + shape: Pointer to array of int64 containing shape dimensions. + dtype: Data type descriptor. + device: Device descriptor. + + Returns: + Pointer to the created CNDArray, or None on failure. + """ + ... + + @abstractmethod + def cndarray_del(self, handle: Any) -> None: + """Delete a CNDArray. + + Args: + handle: Pointer to the CNDArray to delete. + """ + ... + + @abstractmethod + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + """Copy data from a data handle into a CNDArray. + + Args: + handle: Pointer to the destination CNDArray. + data_handle: Pointer to the source data. + """ + ... + + @abstractmethod + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + """Copy data from a CNDArray to a data handle. + + Args: + handle: Pointer to the source CNDArray. + data_handle: Pointer to the destination data. + """ + ... + + @abstractmethod + def evaluate_flops(self, dtype_name: str | bytes) -> float: + """Evaluate the peak floating-point operations per second for a given data type. + + Args: + dtype_name: Data type name as string or bytes (e.g., "float32"). + + Returns: + Peak FLOPS as a double, or 0.0 if the data type is not supported. + """ + ... 
diff --git a/src/xtc/itf/runtime/embedded.py b/src/xtc/itf/runtime/embedded.py new file mode 100644 index 000000000..b97a9339e --- /dev/null +++ b/src/xtc/itf/runtime/embedded.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from abc import ABC, abstractmethod + +from xtc.itf.runtime.common import CommonRuntimeInterface + + +class EmbeddedDevice(CommonRuntimeInterface, ABC): + """Abstract interface for an embedded device.""" + + @abstractmethod + def flash(self, image_path: str) -> None: + """Flash a binary image to the device. + + Args: + image_path (str): Path to the binary image to flash. + """ + ... + + # TODO diff --git a/src/xtc/runtimes/accelerator/gpu/GPUDevice.py b/src/xtc/runtimes/accelerator/gpu/GPUDevice.py new file mode 100644 index 000000000..5afaa7f07 --- /dev/null +++ b/src/xtc/runtimes/accelerator/gpu/GPUDevice.py @@ -0,0 +1,355 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import logging +import ctypes +import logging +import ctypes +from pathlib import Path +from typing import Any, Callable +from typing_extensions import override + +from xtc.itf.runtime.accelerator import AcceleratorDevice +from xtc.itf.comp.module import Module +from xtc.utils.cfunc import CFunc, _str_list_to_c, _c_ascii_str + +from ...host.runtime import resolve_runtime, RuntimeType, runtime_funcs + +__all__ = ["GPUDevice"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.loader import LibLoader +from xtc.utils.tools import get_mlir_prefix +from xtc.utils.ext_tools import cuda_runtime_lib + + +class GPUDevice(AcceleratorDevice): + """A class for GPU device""" + + # This is a singleton class; only one instance of GPUDevice will ever be created. 
+ _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "GPUDevice": + if cls._instance is None: + cls._instance = super(GPUDevice, cls).__new__(cls) + cls._instance.__init_once__(*args) + return cls._instance + + def __init__(self): + # TODO check installation of cuda + pass + + def __init_once__(self): + self._mlir_runtime_lib = LibLoader( + f"{get_mlir_prefix()}/lib/{cuda_runtime_lib}" + ) + self.loaded_kernels: dict[Module, LibLoader] = {} + create_stream_func_name = "mgpuStreamCreate" + create_stream_func = getattr( + self._mlir_runtime_lib.lib, create_stream_func_name + ) + assert create_stream_func is not None, ( + f"Cannot find symbol {create_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + create_stream_func.argtypes = [] + create_stream_func.restype = ctypes.c_voidp + self._custream = create_stream_func() + + def __get_runtime_func(self, name: str) -> Callable: + if name in runtime_funcs: + entries = resolve_runtime(RuntimeType.GPU) + assert entries is not None + return entries[name] + raise AttributeError(f"undefined runtime function: {name}") + + def __del__(self): + remaining_modules = list(self.loaded_kernels.keys()) + for module in remaining_modules: + self.unload_module(module) + self._mlir_runtime_lib.close() + self._instance = None + + @override + def detect_accelerator(self) -> bool: + raise NotImplementedError("GPUDevice.detect_accelerator is not implemented") + + @override + def target_name(self) -> str: + return "nvgpu" + + @override + def device_name(self) -> str: + return "nvgpu" + + @override + def device_arch(self) -> str: + return "cuda" + + @override + def device_id(self) -> int: + return 0 # TODO: Handle multiple GPUs + + @override + def init_device(self) -> None: + # Not necessary for now + pass + + @override + def deinit_device(self) -> None: + # Not necessary for now + pass + + @override + def load_module(self, module: Module) -> None: + libloader = LibLoader(str(Path(module.file_name).absolute())) + 
self.loaded_kernels[module] = libloader + + @override + def get_module_function(self, module: Module, function_name: str) -> Callable: + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + func = getattr(self.loaded_kernels[module].lib, function_name) + assert func is not None, ( + f"Cannot find symbol {function_name} in lib {module.file_name}" + ) + return func + + @override + def unload_module(self, module: Module) -> None: + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + self.loaded_kernels[module].close() + self.loaded_kernels.pop(module) + + @override + def memory_allocate(self, size_bytes: int) -> Any: + func_name = "mgpuMemAlloc" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ctypes.c_uint64, ctypes.c_voidp, ctypes.c_bool] + func.restype = ctypes.c_voidp + return func(size_bytes, self._custream, True) + + @override + def memory_free(self, handle: Any) -> None: + func_name = "mgpuMemFree" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ctypes.c_voidp, ctypes.c_voidp] + func.restype = None + func(handle, self._custream) + + @override + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + # Copy memory to accelerator device + func_name = "mgpuMemcpy" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ + ctypes.c_voidp, + ctypes.c_voidp, + ctypes.c_uint64, + ctypes.c_voidp, + ] + func.restype = None + func(acc_handle, src, size_bytes, self._custream) + # Synchronize stream + sync_stream_func_name = "mgpuStreamSynchronize" + sync_stream_func = 
getattr(self._mlir_runtime_lib.lib, sync_stream_func_name) + assert sync_stream_func is not None, ( + f"Cannot find symbol {sync_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + sync_stream_func.argtypes = [ctypes.c_voidp] + sync_stream_func.restype = None + sync_stream_func(self._custream) + + @override + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + # Copy memory from accelerator device to host + func_name = "mgpuMemcpy" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func.argtypes = [ + ctypes.c_voidp, + ctypes.c_voidp, + ctypes.c_uint64, + ctypes.c_voidp, + ] + func.restype = None + func(dst, acc_handle, size_bytes, self._custream) + # Synchronize stream + sync_stream_func_name = "mgpuStreamSynchronize" + sync_stream_func = getattr(self._mlir_runtime_lib.lib, sync_stream_func_name) + assert sync_stream_func is not None, ( + f"Cannot find symbol {sync_stream_func_name} in lib {self._mlir_runtime_lib.lib}" + ) + sync_stream_func.argtypes = [ctypes.c_voidp] + sync_stream_func.restype = None + sync_stream_func(self._custream) + + @override + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + raise NotImplementedError("memory_fill_zero is not implemented for GPU device") + + @override + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + return ctypes.cast(acc_handle, ctypes.c_void_p) + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + self.__get_runtime_func("evaluate")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(nargs), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ) + + @override + def evaluate_perf( + self, + 
pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + args_array = (ctypes.c_voidp * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + values_num = 1 + if len(pmu_events) > 0: + values_num = len(pmu_events) + # FIXME check if the PMU events are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + self.__get_runtime_func("evaluate_perf")( + ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), + ctypes.c_int(len(args_tuples)), + ) + return [float(x) for x in results_array] + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError("evaluate_packed is not implemented for GPU device") + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError( + "evaluate_packed_perf is not implemented for GPU device" + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + # Convert shape if it's a list/tuple to ctypes array + if isinstance(shape, (list, tuple)): + shape_array = (ctypes.c_int64 * len(shape))(*shape) + shape = ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)) + return self.__get_runtime_func("cndarray_new")( + ctypes.c_int32(ndim), + shape, + dtype, + device, + ) + + @override + def cndarray_del(self, handle: Any) -> None: + 
self.__get_runtime_func("cndarray_del")(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_from_data")(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_to_data")(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return float( + self.__get_runtime_func("evaluate_flops")( + _c_ascii_str.from_param(dtype_name) + ) + ) + + # Extra methods + def _register_buffer(self, handle: Any, size_bytes: int) -> None: + nb_bytes_c = ctypes.c_int64(size_bytes) + buffer_ptr = ctypes.cast(handle, ctypes.c_void_p) + func_name = "mgpuMemHostRegister" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func(buffer_ptr, nb_bytes_c) + + def _unregister_buffer(self, handle: Any) -> None: + buffer_ptr = ctypes.cast(handle, ctypes.c_void_p) + func_name = "mgpuMemHostUnregister" + func = getattr(self._mlir_runtime_lib.lib, func_name) + assert func is not None, ( + f"Cannot find symbol {func_name} in lib {self._mlir_runtime_lib.lib}" + ) + func(buffer_ptr) diff --git a/src/xtc/runtimes/accelerator/gpu/__init__.py b/src/xtc/runtimes/accelerator/gpu/__init__.py new file mode 100644 index 000000000..adfff5caa --- /dev/null +++ b/src/xtc/runtimes/accelerator/gpu/__init__.py @@ -0,0 +1,7 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .GPUDevice import GPUDevice + +__all__ = ["GPUDevice"] diff --git a/src/xtc/runtimes/accelerator/mppa/MppaDevice.py b/src/xtc/runtimes/accelerator/mppa/MppaDevice.py new file mode 100644 index 000000000..bbb982935 --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/MppaDevice.py @@ -0,0 +1,576 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 
The XTC Project Authors +# +import ctypes +import subprocess +import logging +import os +import subprocess +import ctypes +import sys +from pathlib import Path +from typing import Any, Callable +from typing_extensions import override + +from xtc.itf.runtime.accelerator import AcceleratorDevice +from xtc.itf.comp.module import Module +from xtc.utils.cfunc import CFunc + +__all__ = ["MppaDevice"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.ext_tools import cc_bin + +from .config import MppaConfig +from xtc.utils.loader import LibLoader +from xtc.runtimes.host.HostRuntime import HostRuntime + +MAX_NB_LOADED_KERNELS = 10 + + +def _get_csrcs_dir(): + return Path(__file__).parents[3] / "csrcs" / "runtimes" / "accelerator" / "mppa" + + +def _execute_command( + cmd: list[str], + input_pipe: str | None = None, + pipe_stdoutput: bool = True, + debug: bool = False, +) -> subprocess.CompletedProcess: + pretty_cmd = "| " if input_pipe else "" + pretty_cmd += " ".join(cmd) + if debug: + print(f"> exec: {pretty_cmd}", file=sys.stderr) + + if input_pipe and pipe_stdoutput: + result = subprocess.run( + cmd, input=input_pipe, stdout=subprocess.PIPE, text=True + ) + elif input_pipe and not pipe_stdoutput: + result = subprocess.run(cmd, input=input_pipe, text=True) + elif not input_pipe and pipe_stdoutput: + result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, text=True) + return result + + +def _compile_kvx_object(device: "MppaDevice", src_file: str, obj_file: str): + cmd_kvx_cc = [f"{device._csw_path}/bin/kvx-cos-gcc"] + cmd = cmd_kvx_cc + [ + "-O2", + "-fPIC", + f"-I{device._mlir_mppa_path}/include", + "-march=kv3-2", + "-c", + src_file, + "-o", + obj_file, + ] + return _execute_command(cmd=cmd, debug=device.config.mlir_config.debug) + + +def _compile_host_object(device: "MppaDevice", src_file: 
str, obj_file: str): + cmd_host_cc = [cc_bin] + cmd = cmd_host_cc + [ + "-O2", + "-fPIC", + "-Wall", + "-Wextra", + f"-I{device._mlir_mppa_path}/include", + f"-I{device._csw_path}/include", + f"-I{_get_csrcs_dir()}", + "-DNB_CC=5", + "-DTARGET_KV3_2", + '-DKERNEL_PATHNAME="' + device.config.work_dir + "/mppa_runtime_acc.so" + '"', + "-c", + src_file, + "-o", + obj_file, + ] + return _execute_command(cmd=cmd, debug=device.config.mlir_config.debug) + + +def _compile_runtime_lib(device: "MppaDevice") -> LibLoader: + kvx_src_files = [ + device._mlir_mppa_path + "/src/runtime/mppa_management_accelerator.c", + ] + host_src_files = [ + device._mlir_mppa_path + "/src/runtime/mppa_management_host.c", + str(_get_csrcs_dir() / "host.c"), + ] + + # Compile KVX objects + kvx_obj_files = [ + f"{device.config.work_dir}/{Path(file).stem}.o" for file in kvx_src_files + ] + for src_file, obj_file in zip(kvx_src_files, kvx_obj_files): + _compile_kvx_object(device, src_file, obj_file) + # Link KVX objects + cmd_kvx_cc = [f"{device._csw_path}/bin/kvx-cos-gcc"] + cmd_kvx_link = cmd_kvx_cc + [ + "-shared", + "-fPIC", + "-march=kv3-2", + "-Wl,-soname=mppa_runtime_acc.so", + *kvx_obj_files, + "-o", + device.config.work_dir + "/mppa_runtime_acc.so", + ] + exe_process = _execute_command( + cmd=cmd_kvx_link, debug=device.config.mlir_config.debug + ) + assert exe_process.returncode == 0 + + # Compile host objects + host_obj_files = [ + f"{device.config.work_dir}/{Path(file).stem}.o" for file in host_src_files + ] + for src_file, obj_file in zip(host_src_files, host_obj_files): + _compile_host_object(device, src_file, obj_file) + # Link host objects + cmd_host_cc = [cc_bin] + cmd_host_link = cmd_host_cc + [ + "-shared", + "-fPIC", + "-O2", + *host_obj_files, + "-o", + device.config.work_dir + "/mppa_runtime_host.so", + "-Wl,-rpath,$ORIGIN/../lib", + "-L" + device._csw_path + "/lib", + "-lmppa_offload_host", + "-lmopd", + "-lmppa_rproc_host", + "-lpthread", + ] + exe_process = _execute_command( 
+ cmd=cmd_host_link, debug=device.config.mlir_config.debug + ) + assert exe_process.returncode == 0 + + return LibLoader(device.config.work_dir + "/mppa_runtime_host.so") + + +class MppaDevice(AcceleratorDevice): + """A class for Mppa device""" + + # This is a singleton class; only one instance of MppaDevice will ever be created. + _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "MppaDevice": + if cls._instance is None: + cls._instance = super(MppaDevice, cls).__new__(cls) + cls._instance.__init_once__(*args) + return cls._instance + + def __init__(self, config: MppaConfig | None = None): + try: + import mlir_mppa + except ImportError: + raise ImportError( + "mlir_mppa is not installed but is required for MPPA target" + ) + try: + self._csw_path = os.environ["KALRAY_TOOLCHAIN_DIR"] + except KeyError: + raise KeyError( + "Please source the Kalray Accesscore Toolchain: https://www.kalrayinc.com/products/software/" + ) + self._mlir_mppa_path = mlir_mppa.__path__[0] + if (config is not None) and (config != self.config): + raise ValueError( + "MppaDevice already initialized with a different configuration" + ) + + def __init_once__(self, config: MppaConfig | None = None): + if config is None: + config = MppaConfig() + self.config: MppaConfig = config + self.lib_loader: LibLoader | None = None + self.mppa_initialized: bool = False + self.loaded_kernels: dict[Module, LibLoader] = {} + self.calls_counter: int = 0 + self.need_rebuild: bool = False + + def __build_runtime_lib(self) -> LibLoader: + os.system("mkdir -p " + self.config.work_dir) + build_subdir = self.config.work_dir + "/mppa_management" + os.system("mkdir -p " + build_subdir) + if self.config.platform in ["iss", "qemu"]: + os.environ["OMP_MPPA_FIRMWARE_NAME"] = self.config.firmware + os.environ["MPPA_RPROC_PLATFORM_MODE"] = "sim" + os.environ["MPPA_RPROC_SIM_PATH"] = self.config.work_dir + "/mymppa" + if self.need_rebuild: + os.system("rm -r " + build_subdir + "/*") + self.need_rebuild = 
False + return _compile_runtime_lib(self) + + def _insert_mock_tracepoints(self): + assert self.lib_loader is not None + kernel_fn = getattr(self.lib_loader.lib, "mppa_insert_mock_tracepoints") + kernel_fn() + + def __del__(self): + if self.mppa_initialized: + self.deinit_device() + if self.lib_loader is not None: + self.lib_loader.close() + self.lib_loader = None + self._instance = None + + @override + def detect_accelerator(self) -> bool: + o = subprocess.run( + ["kvx-board-diag", "--list-board"], capture_output=True, text=True + ) + if "No Available board" in o.stdout: + return False + return True + + @override + def target_name(self) -> str: + return "mppa" + + @override + def device_name(self) -> str: + return "k300" + + @override + def device_arch(self) -> str: + return "kv3-2" + + @override + def device_id(self) -> int: + return 0 # TODO: Allow multiple mppa per machine (e.g. TC4) + + @override + def init_device(self) -> None: + """Pre-Init Mppa-Offload, which takes around 3 secondes""" + if self.mppa_initialized: + return + if self.lib_loader is None: + self.lib_loader = self.__build_runtime_lib() + assert self.lib_loader is not None + if self.config.verbose: + print("(Mppa Pre-Init)") + os.environ["MLIR_MPPA_FIRMWARE_NAME"] = self.config.firmware + # prepare qemu/iss + if self.config.platform in ["iss", "qemu"]: + os.system("mkdir -p " + self.config.work_dir + "/mymppa") + os.environ["OMP_MPPA_FIRMWARE_NAME"] = self.config.firmware + os.environ["MPPA_RPROC_PLATFORM_MODE"] = "sim" + os.environ["MPPA_RPROC_SIM_PATH"] = self.config.work_dir + "/mymppa" + if self.config.platform == "iss": + if self.config.verbose: + print("(Launching ISS)") + subprocess.Popen( + "kvx-cluster --disable-cache --march=" + + self.config.arch + + " --no-load-elf --sim-server=SOCKET --mmap --mppa-wdir=" + + self.config.work_dir + + "/mymppa", + shell=True, + ) + elif self.config.platform == "qemu": + if self.config.verbose: + print("(Launching Qemu)") + subprocess.Popen( + 
"kvx-qemu-offload-bridge --arch " + + self.config.arch + + " --work-dir " + + self.config.work_dir + + "/mymppa", + shell=True, + ) + # set env variables for traces + if self.config.mppa_trace_enable: + if self.config.verbose: + print("(Using Mppa traces)") + if self.config.platform in ["iss", "qemu"]: + print( + "[Warning: Mppa traces are enabled, ISS/Qemu cannot handle them]" + ) + if self.config.mppa_trace_use_syscall: + os.environ["MPPA_ENVP"] = ( + "MPPA_TRACE_ENABLE_META=1 MPPA_TRACE_USE_SYSCALL=1" + ) + if self.config.verbose: + print( + "[Warning: Mppa traces are enabled using syscalls, please consider using hardware acquisition if the overhead is too high]" + ) + else: + os.environ["MPPA_ENVP"] = "MPPA_TRACE_ENABLE_META=1" + preinit_fn = getattr(self.lib_loader.lib, "mppa_init_device") + preinit_fn.restype = ctypes.c_bool + if not preinit_fn(): + raise Exception("Failed to pre-init Mppa-Offload") + self.mppa_initialized = True + + @override + def deinit_device(self) -> None: + """De-Init Mppa-Offload""" + assert self.lib_loader is not None + if not self.mppa_initialized: + return + remaining_modules = list(self.loaded_kernels.keys()) + for module in remaining_modules: + self.unload_module(module) + if self.config.verbose: + print("(Mppa De-Init)") + deinit_fn = getattr(self.lib_loader.lib, "mppa_deinit_device") + deinit_fn.restype = ctypes.c_bool + if not deinit_fn(): + raise Exception("Failed to de-init Mppa-Offload") + self.mppa_initialized = False + + @override + def load_module(self, module: Module) -> None: + """Add a new loaded kernel in the cache""" + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + if len(self.loaded_kernels) == MAX_NB_LOADED_KERNELS: + if self.config.verbose: + print( + "(maximum number of loaded kernels exceeded, removing the last recently used)" + ) + assert False, "Maximum number of loaded kernels exceeded" + # FIXME + # Init if not already done + self.init_device() + # Add new kernel + 
if self.config.verbose: + print("(Loading kernel: " + module.name + ")") + libloader = LibLoader(str(Path(module.file_name).absolute())) + # Pass the context created during pre-init + get_mppa_common_structures_fn = getattr( + self.lib_loader.lib, "mppa_get_common_structures" + ) + get_mppa_common_structures_fn.restype = ctypes.c_void_p + mppa_common_structures = get_mppa_common_structures_fn() + set_mppa_common_structures_fn = getattr( + libloader.lib, "set_mppa_common_structures" + ) + set_mppa_common_structures_fn.argtypes = [ctypes.c_void_p] + set_mppa_common_structures_fn(mppa_common_structures) + # Load kernel + load_kernel_fn = getattr(libloader.lib, "load_kernel") + load_kernel_fn() + self.loaded_kernels[module] = libloader + + @override + def get_module_function(self, module: Module, function_name: str) -> Callable: + if not self.mppa_initialized: + self.init_device() + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + func = getattr(self.loaded_kernels[module].lib, function_name) + assert func is not None, ( + f"Cannot find symbol {function_name} in lib {module.file_name}" + ) + return func + + @override + def unload_module(self, module: Module) -> None: + """Remove a loaded kernel from the cache""" + if not self.mppa_initialized: + self.init_device() + if module not in self.loaded_kernels.keys(): + raise Exception("Kernel is not loaded") + self.loaded_kernels[module].close() + self.loaded_kernels.pop(module) + + @override + def memory_allocate(self, size_bytes: int) -> Any: + assert self.lib_loader is not None + # Create a memory handle + create_memory_handle_fn = getattr( + self.lib_loader.lib, "mppa_create_memory_handle" + ) + create_memory_handle_fn.restype = ctypes.c_void_p + memory_handle = create_memory_handle_fn() + # Allocate memory + allocate_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_allocate") + allocate_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + allocate_memory_fn.restype = 
ctypes.c_bool + if not allocate_memory_fn(memory_handle, size_bytes): + raise Exception("Failed to allocate memory") + return memory_handle + + @override + def memory_free(self, handle: Any) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Free memory + free_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_free") + free_memory_fn.argtypes = [ctypes.c_void_p] + free_memory_fn.restype = ctypes.c_bool + if not free_memory_fn(handle): + raise Exception("Failed to free memory") + # Destroy memory handle + destroy_memory_handle_fn = getattr( + self.lib_loader.lib, "mppa_destroy_memory_handle" + ) + destroy_memory_handle_fn.argtypes = [ctypes.c_void_p] + destroy_memory_handle_fn.restype = ctypes.c_bool + if not destroy_memory_handle_fn(handle): + raise Exception("Failed to destroy memory handle") + + @override + def memory_copy_to( + self, acc_handle: Any, src: ctypes.c_void_p, size_bytes: int + ) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Copy memory to accelerator device + copy_to_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_copy_to") + copy_to_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t] + copy_to_memory_fn.restype = ctypes.c_bool + if not copy_to_memory_fn(acc_handle, src, size_bytes): + raise Exception("Failed to copy memory to accelerator device") + + @override + def memory_copy_from( + self, acc_handle: Any, dst: ctypes.c_void_p, size_bytes: int + ) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Copy memory from accelerator device to host + copy_from_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_copy_from") + copy_from_memory_fn.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + copy_from_memory_fn.restype = ctypes.c_bool + if not copy_from_memory_fn(acc_handle, dst, size_bytes): + raise Exception("Failed to copy memory from 
accelerator device to host") + + @override + def memory_fill_zero(self, acc_handle: Any, size_bytes: int) -> None: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + fill_zero_memory_fn = getattr(self.lib_loader.lib, "mppa_memory_fill_zero") + fill_zero_memory_fn.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + fill_zero_memory_fn.restype = ctypes.c_bool + if not fill_zero_memory_fn(acc_handle, size_bytes): + raise Exception("Failed to fill memory with zeros") + + @override + def memory_data_pointer(self, acc_handle: Any) -> ctypes.c_void_p: + if not self.mppa_initialized: + self.init_device() + assert self.lib_loader is not None + # Get data pointer + get_data_pointer_fn = getattr(self.lib_loader.lib, "mppa_memory_data_pointer") + get_data_pointer_fn.argtypes = [ctypes.c_void_p] + get_data_pointer_fn.restype = ctypes.c_void_p + return get_data_pointer_fn(acc_handle) + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + HostRuntime.get().evaluate( + results, + repeat, + number, + nargs, + cfunc, + args, + ) + + @override + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + return HostRuntime.get().evaluate_perf( + pmu_events, + repeat, + number, + min_repeat_ms, + cfunc, + args_tuples, + ) + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError("evaluate_packed is not implemented for MPPA device") + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + raise NotImplementedError( + "evaluate_packed_perf is not 
implemented for MPPA device" + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + return HostRuntime.get().cndarray_new(ndim, shape, dtype, device) + + @override + def cndarray_del(self, handle: Any) -> None: + HostRuntime.get().cndarray_del(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + HostRuntime.get().cndarray_copy_from_data(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + HostRuntime.get().cndarray_copy_to_data(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return HostRuntime.get().evaluate_flops(dtype_name) diff --git a/src/xtc/runtimes/accelerator/mppa/__init__.py b/src/xtc/runtimes/accelerator/mppa/__init__.py new file mode 100644 index 000000000..4fdcb768f --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/__init__.py @@ -0,0 +1,8 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .config import MppaConfig +from .MppaDevice import MppaDevice + +__all__ = ["MppaConfig", "MppaDevice"] diff --git a/src/xtc/runtimes/accelerator/mppa/config.py b/src/xtc/runtimes/accelerator/mppa/config.py new file mode 100644 index 000000000..5e7b1a3c1 --- /dev/null +++ b/src/xtc/runtimes/accelerator/mppa/config.py @@ -0,0 +1,161 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import os +from typing import Any +from typing_extensions import override + +from xtc.backends.mlir.MlirConfig import MlirConfig + +VALID_PLATFORMS = ["hw", "iss", "qemu"] +VALID_ARCHS = ["kv3-1", "kv3-2"] +VALID_FIRMWARES = ["ocl_fw_l1.elf"] # TODO add other firmwares + +DEFAULT_WORK_DIR = "/tmp/" + os.getlogin() + "/mlir_mppa" +DEFAULT_PLATFORM = "hw" +DEFAULT_ARCH = "kv3-2" +DEFAULT_FIRMWARE = "ocl_fw_l1.elf" +DEFAULT_VERBOSE = True 
+DEFAULT_BUILD_VERBOSE = 0 +DEFAULT_BENCHMARK = False +DEFAULT_MPPA_TRACE_ENABLE = False +DEFAULT_MPPA_TRACE_USE_SYSCALL = True + + +class MppaConfig: + """A class to gather all configs""" + + def __init__(self, mlir_config: MlirConfig | None = None): + if mlir_config is None: + mlir_config = MlirConfig() + # Default Configuration + self.work_dir: str = DEFAULT_WORK_DIR + self.platform: str = get_platform() + self.arch: str = DEFAULT_ARCH + self.firmware: str = DEFAULT_FIRMWARE + self.verbose: bool = mlir_config.debug + self.build_verbose: int = mlir_config.debug + self.benchmark: bool = DEFAULT_BENCHMARK + self.mppa_trace_enable: bool = DEFAULT_MPPA_TRACE_ENABLE + self.mppa_trace_use_syscall: bool = DEFAULT_MPPA_TRACE_USE_SYSCALL + self.mlir_config: MlirConfig = mlir_config + # Read from env + self.set_platform(get_platform()) + self.set_benchmark(is_benchmark()) + self.set_mppa_trace_enable(mppa_trace_enable()) + self.set_mppa_trace_use_syscall(mppa_trace_use_syscall()) + if os.getenv("CLEAN_WORK_DIR", "0") in ["1", "true", "True"]: + self.clean_work_dir() + + def set_work_dir(self, work_dir: str) -> None: + if self.work_dir != "" and self.work_dir != "/": + self.work_dir = work_dir + + def set_platform(self, platform: str) -> None: + if platform in VALID_PLATFORMS: + self.platform = platform + + def set_arch(self, arch: str) -> None: + if arch in VALID_ARCHS: + self.arch = arch + + def set_firmware(self, firmware: str) -> None: + if firmware in VALID_FIRMWARES: + self.firmware = firmware + + def set_verbose(self, verbose: bool) -> None: + self.verbose = verbose + + def set_build_verbose(self, build_verbose: int) -> None: + self.build_verbose = build_verbose + + def set_benchmark(self, benchmark: bool) -> None: + self.benchmark = benchmark + + def set_mppa_trace_enable(self, mppa_trace_enable: bool) -> None: + self.mppa_trace_enable = mppa_trace_enable + + def set_mppa_trace_use_syscall(self, mppa_trace_use_syscall: bool) -> None: + self.mppa_trace_use_syscall = 
mppa_trace_use_syscall + + def clean_work_dir(self): + if os.path.exists(self.work_dir): + os.system("rm -r " + self.work_dir) + + @override + def __str__(self) -> str: + s = "Mppa configuration:\n" + s += " - work_dir: " + self.work_dir + "\n" + s += " - platform: " + self.platform + "\n" + s += " - arch: " + self.arch + "\n" + s += " - firmware: " + self.firmware + "\n" + s += " - verbose: " + str(self.verbose) + "\n" + s += " - build_verbose: " + str(self.build_verbose) + "\n" + s += " - benchmark: " + str(self.benchmark) + s += " - mppa_trace_enable: " + str(self.mppa_trace_enable) + "\n" + s += " - mppa_trace_use_syscall: " + str(self.mppa_trace_use_syscall) + "\n" + return s + + @override + def __eq__(self, other: Any) -> bool: + if not isinstance(other, MppaConfig): + return False + return ( + self.work_dir == other.work_dir + and self.platform == other.platform + and self.arch == other.arch + and self.firmware == other.firmware + and self.verbose == other.verbose + and self.build_verbose == other.build_verbose + and self.benchmark == other.benchmark + and self.mppa_trace_enable == other.mppa_trace_enable + and self.mppa_trace_use_syscall == other.mppa_trace_use_syscall + ) + + +# Creation of a MppaConfig from env + + +def get_platform() -> str: + platform = os.getenv("PLATFORM") + if platform is not None: + if platform in VALID_PLATFORMS: + return platform + else: + print(f"\033[91mUnknown platform: {platform}\033[0m") + exit(1) + return DEFAULT_PLATFORM + + +def is_benchmark(): + if "BENCHMARK" in os.environ: + if os.getenv("BENCHMARK") in ["1", "true", "True"]: + return True + else: + return False + return DEFAULT_BENCHMARK + + +def mppa_trace_enable(): + if "MPPA_TRACE_ENABLE" in os.environ: + if os.getenv("MPPA_TRACE_ENABLE") in ["1", "true", "True"]: + return True + else: + return False + if is_benchmark(): + return True + return DEFAULT_MPPA_TRACE_ENABLE + + +def mppa_trace_use_syscall(): + if "MPPA_TRACE_USE_SYSCALL" in os.environ: + if 
os.getenv("MPPA_TRACE_USE_SYSCALL") in ["1", "true", "True"]: + return True + else: + return False + return DEFAULT_MPPA_TRACE_USE_SYSCALL + + +def mppa_trace_use_oculink(): + return not mppa_trace_use_syscall() diff --git a/src/xtc/runtimes/gpu/runtime.py b/src/xtc/runtimes/gpu/runtime.py deleted file mode 100644 index 92a09d069..000000000 --- a/src/xtc/runtimes/gpu/runtime.py +++ /dev/null @@ -1,26 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -import logging - -from xtc.runtimes.host.runtime import runtime_funcs, resolve_runtime, RuntimeType - -logger = logging.getLogger(__name__) - -# Can be set to True for RUNTIME_DEBUG -RUNTIME_DEBUG = False - -# GPU Runtime - - -def type() -> RuntimeType: - return RuntimeType.GPU - - -def __getattr__(x: str): - if x in runtime_funcs: - entries = resolve_runtime(RuntimeType.GPU) - assert entries is not None - return entries[x] - raise AttributeError(f"undefined runtime function: {x}") diff --git a/src/xtc/runtimes/host/HostRuntime.py b/src/xtc/runtimes/host/HostRuntime.py new file mode 100644 index 000000000..c4a6440d6 --- /dev/null +++ b/src/xtc/runtimes/host/HostRuntime.py @@ -0,0 +1,210 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +import ctypes +import logging +from typing import Any, Callable +from typing_extensions import override + +from xtc.runtimes.types.dlpack import DLDevice, DLDataType + +from xtc.utils.cfunc import CFunc, _str_list_to_c, _c_ascii_str +from xtc.itf.runtime.common import CommonRuntimeInterface + +from .runtime import runtime_funcs, resolve_runtime, RuntimeType + +__all__ = ["HostRuntime"] + +logger = logging.getLogger(__name__) + +# Can be set to True for RUNTIME_DEBUG +RUNTIME_DEBUG = False + + +class HostRuntime(CommonRuntimeInterface): + """A class for Host runtime""" + + # This is a singleton class; only one instance of HostRuntime will ever be created. 
+ _instance = None + + def __new__(cls, *args: Any, **kwargs: Any) -> "HostRuntime": + if cls._instance is None: + cls._instance = super(HostRuntime, cls).__new__(cls) + return cls._instance + + def __init__(self): + pass + + def __del__(self): + self._instance = None + + def __get_runtime_func(self, name: str) -> Callable: + if name in runtime_funcs: + entries = resolve_runtime(RuntimeType.HOST) + assert entries is not None + return entries[name] + raise AttributeError(f"undefined runtime function: {name}") + + def __getattr__(self, name: str) -> Callable: + return self.__get_runtime_func(name) + + @classmethod + def get(cls) -> "HostRuntime": + if cls._instance is None: + cls._instance = HostRuntime() + return cls._instance + + @override + def target_name(self) -> str: + return "host" + + @override + def device_name(self) -> str: + return "host" + + @override + def device_arch(self) -> str: + return "host" + + @override + def device_id(self) -> int: + return 0 + + @override + def evaluate( + self, + results: Any, + repeat: int, + number: int, + nargs: int, + cfunc: CFunc, + args: Any, + ) -> None: + self.__get_runtime_func("evaluate")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(nargs), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ) + + @override + def evaluate_perf( + self, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args_tuples: list[Any], + ) -> list[float]: + args_array = (ctypes.c_voidp * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + values_num = 1 + if len(pmu_events) > 0: + values_num = len(pmu_events) + # FIXME check if the PMU events are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + self.__get_runtime_func("evaluate_perf")( + ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), + 
ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), + ctypes.c_int(len(args_tuples)), + ) + return [float(x) for x in results_array] + + @override + def evaluate_packed( + self, + results: Any, + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + self.__get_runtime_func("evaluate_packed")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ctypes.cast(codes, ctypes.POINTER(ctypes.c_int)), + ctypes.c_int(nargs), + ) + + @override + def evaluate_packed_perf( + self, + results: Any, + pmu_events: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + cfunc: CFunc, + args: Any, + codes: Any, + nargs: int, + ) -> None: + self.__get_runtime_func("evaluate_packed_perf")( + ctypes.cast(results, ctypes.POINTER(ctypes.c_double)), + ctypes.c_int(len(pmu_events)), + _str_list_to_c(pmu_events), + ctypes.c_int(repeat), + ctypes.c_int(number), + ctypes.c_int(min_repeat_ms), + ctypes.cast(cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), + ctypes.cast(args, ctypes.POINTER(ctypes.c_voidp)), + ctypes.cast(codes, ctypes.POINTER(ctypes.c_int)), + ctypes.c_int(nargs), + ) + + @override + def cndarray_new( + self, + ndim: int, + shape: Any, + dtype: DLDataType, + device: DLDevice, + ) -> Any: + # Convert shape if it's a list/tuple to ctypes array + if isinstance(shape, (list, tuple)): + shape_array = (ctypes.c_int64 * len(shape))(*shape) + shape = ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)) + return self.__get_runtime_func("cndarray_new")( + ctypes.c_int32(ndim), + shape, + dtype, + device, 
+ ) + + @override + def cndarray_del(self, handle: Any) -> None: + self.__get_runtime_func("cndarray_del")(handle) + + @override + def cndarray_copy_from_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_from_data")(handle, data_handle) + + @override + def cndarray_copy_to_data(self, handle: Any, data_handle: Any) -> None: + self.__get_runtime_func("cndarray_copy_to_data")(handle, data_handle) + + @override + def evaluate_flops(self, dtype_name: str | bytes) -> float: + return float( + self.__get_runtime_func("evaluate_flops")( + _c_ascii_str.from_param(dtype_name) + ) + ) diff --git a/src/xtc/runtimes/host/__init__.py b/src/xtc/runtimes/host/__init__.py index 4a40722d1..ab0e1dd81 100644 --- a/src/xtc/runtimes/host/__init__.py +++ b/src/xtc/runtimes/host/__init__.py @@ -2,3 +2,6 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024-2026 The XTC Project Authors # +from .HostRuntime import HostRuntime + +__all__ = ["HostRuntime"] diff --git a/src/xtc/runtimes/host/evaluate.py b/src/xtc/runtimes/host/evaluate.py deleted file mode 100644 index c15e3f96a..000000000 --- a/src/xtc/runtimes/host/evaluate.py +++ /dev/null @@ -1,75 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from types import ModuleType -import numpy as np -import numpy.typing -from pathlib import Path - -from xtc.utils.loader import LibLoader -from xtc.runtimes.types.ndarray import NDArray - -from .evaluator import Executor, Evaluator - - -def load_and_evaluate( - runtime: ModuleType, - module_file: str, - module_name: str, - payload_name: str, - **kwargs: Any, -) -> tuple[list[float], int, str]: - bare_ptr = kwargs.get("bare_ptr", True) - dll = str(Path(module_file).absolute()) - sym = payload_name - parameters: tuple[list[NDArray], list[NDArray]] = kwargs.get("parameters", []) - ref_outputs: list[numpy.typing.NDArray] = kwargs.get("ref_outputs", []) - validate = 
kwargs.get("validate", False) - repeat = kwargs.get("repeat", 1) - number = kwargs.get("number", 1) - min_repeat_ms = kwargs.get("min_repeat_ms", 0) - pmu_counters = kwargs.get("pmu_counters", []) - with LibLoader(dll) as lib: - func = getattr(lib, sym) - assert func is not None, f"Cannot find symbol {sym} in lib {dll}" - func.packed = not bare_ptr - if validate: - exec_func = Executor(func) - exec_func(*parameters[0], *parameters[1]) - for out_ref, out in zip( - ref_outputs, [out.numpy() for out in parameters[1]] - ): - if not np.allclose(out_ref, out): - return [], 1, "Error in validation: outputs differ" - eval_func = Evaluator( - func, - runtime, - repeat=repeat, - min_repeat_ms=min_repeat_ms, - number=number, - pmu_counters=pmu_counters, - ) - results = eval_func(*parameters[0], *parameters[1]) - return results, 0, "" - - -def load_and_execute( - runtime: ModuleType, - module_file: str, - module_name: str, - payload_name: str, - **kwargs: Any, -) -> int: - _, code, _ = load_and_evaluate( - runtime, - module_file, - module_name, - payload_name, - repeat=1, - min_repeat_ms=0, - number=1, - **kwargs, - ) - return code diff --git a/src/xtc/runtimes/host/evaluator.py b/src/xtc/runtimes/host/evaluator.py deleted file mode 100644 index 4c49c760a..000000000 --- a/src/xtc/runtimes/host/evaluator.py +++ /dev/null @@ -1,180 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from types import ModuleType -import ctypes - -from xtc.runtimes.host.runtime import RuntimeType - -__all__ = [ - "Evaluator", - "Executor", -] - - -class ArgTypeCode: - INT = 0 - HANDLE = 3 - NDARRAY_HANDLE = 13 - - -CArgCode = ctypes.c_int - - -class CArgValue(ctypes.Union): - _fields_ = [ - ("v_int64", ctypes.c_int64), - ("v_float64", ctypes.c_double), - ("v_handle", ctypes.c_void_p), - ("v_str", ctypes.c_char_p), - ] - - -class CRetValue(CArgValue): - pass - - -CPackedFunc = ctypes.CFUNCTYPE( - ctypes.c_int, - 
ctypes.POINTER(CArgValue), - ctypes.POINTER(CArgCode), - ctypes.c_int, - ctypes.POINTER(CRetValue), - ctypes.POINTER(CArgCode), -) - - -class CFunc: - def __init__(self, f: Any, packed: bool = False) -> None: - self.handle = f - self.is_packed = packed or ( - hasattr(self.handle, "packed") and self.handle.packed - ) - - def arg_tuple(self, arg: Any) -> Any: - if arg.__class__.__name__ == "ndarray": # Numpy Array - assert not self.is_packed - return (arg.ctypes.data_as(ctypes.c_voidp), ArgTypeCode.HANDLE) - elif arg.__class__.__name__ == "NDArray": # TVM NDArray or our NDArray - if self.is_packed: - return ( - CArgValue(v_handle=ctypes.cast(arg.handle, ctypes.c_void_p)), - ArgTypeCode.NDARRAY_HANDLE, - ) - else: - return ( - ctypes.cast(arg.handle.contents.dl_tensor.data, ctypes.c_void_p), - ArgTypeCode.HANDLE, - ) - else: - assert 0, f"Unsupported argument class: {arg.__class__.__name__}" - - def args_tuples(self, args: Any) -> list[Any]: - return [self.arg_tuple(arg) for arg in args] - - def __call__(self, *args: Any): - args_tuples = self.args_tuples(args) - if self.is_packed: - args_array = (CArgValue * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - args_codes = (CArgCode * len(args_tuples))(*[arg[1] for arg in args_tuples]) - result_val = CRetValue(0) - result_code = CArgCode(ArgTypeCode.INT) - res = CPackedFunc(self.handle)( - args_array, - args_codes, - len(args_tuples), - ctypes.byref(result_val), - ctypes.byref(result_code), - ctypes.c_int(len(args_tuples)), - ) - assert res == 0, f"error calling packed function" - else: - data_args = [arg[0] for arg in args_tuples] - self.handle(*data_args) - - -class Evaluator: - def __init__( - self, - f: Any, - runtime: ModuleType, - repeat: int = 1, - number: int = 1, - min_repeat_ms: int = 0, - pmu_counters: list[str] = [], - ) -> None: - assert repeat > 0 - assert number > 0 - assert min_repeat_ms >= 0 - self.repeat = repeat - self.number = number - self.min_repeat_ms = min_repeat_ms - self.pmu_counters 
= pmu_counters - self.runtime = runtime - self.cfunc = CFunc(f) - - def _str_list_to_c(self, str_list: list[str]) -> Any: - return (ctypes.c_char_p * len(str_list))( - *[str.encode("utf-8") for str in str_list] - ) - - def __call__(self, *args: Any) -> list[float]: - args_tuples = self.cfunc.args_tuples(args) - values_num = 1 - if len(self.pmu_counters) > 0: - values_num = len(self.pmu_counters) - if ( - any(counter.startswith("gpu.") for counter in self.pmu_counters) - and self.runtime.type() != RuntimeType.GPU - ): - raise ValueError( - "GPU PMU counters are not requested but target is not a GPU." - ) - results_array = (ctypes.c_double * (self.repeat * values_num))() - if self.cfunc.is_packed: - args_array_packed = (CArgValue * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - args_codes_packed = (CArgCode * len(args_tuples))( - *[arg[1] for arg in args_tuples] - ) - self.runtime.evaluate_packed_perf( - ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), - ctypes.c_int(len(self.pmu_counters)), - self._str_list_to_c(self.pmu_counters), - ctypes.c_int(self.repeat), - ctypes.c_int(self.number), - ctypes.c_int(self.min_repeat_ms), - ctypes.cast(self.cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), - ctypes.cast(args_array_packed, ctypes.POINTER(ctypes.c_voidp)), - ctypes.cast(args_codes_packed, ctypes.POINTER(ctypes.c_int)), - ctypes.c_int(len(args_tuples)), - ) - else: - args_array = (ctypes.c_voidp * len(args_tuples))( - *[arg[0] for arg in args_tuples] - ) - self.runtime.evaluate_perf( - ctypes.cast(results_array, ctypes.POINTER(ctypes.c_double)), - ctypes.c_int(len(self.pmu_counters)), - self._str_list_to_c(self.pmu_counters), - ctypes.c_int(self.repeat), - ctypes.c_int(self.number), - ctypes.c_int(self.min_repeat_ms), - ctypes.cast(self.cfunc.handle, ctypes.CFUNCTYPE(ctypes.c_voidp)), - ctypes.cast(args_array, ctypes.POINTER(ctypes.c_voidp)), - ctypes.c_int(len(args_tuples)), - ) - return [float(x) for x in results_array] - - -class Executor: 
- def __init__(self, f: Any) -> None: - self.func = CFunc(f) - - def __call__(self, *args: Any) -> None: - self.func(*args) diff --git a/src/xtc/runtimes/host/runtime.py b/src/xtc/runtimes/host/runtime.py index 17cf41db4..b1976cce6 100644 --- a/src/xtc/runtimes/host/runtime.py +++ b/src/xtc/runtimes/host/runtime.py @@ -175,7 +175,7 @@ def _compile_runtime(out_dll: str, tdir: str, runtime_type: RuntimeType): for i, file in enumerate(src_files): cmd = ( "cc -c -O2 -march=native -fPIC " - f"-I{src_dir} {debug_opts} {pfm_opts} {gpu_opts} -I{src_dir}/../gpu " + f"-I{src_dir} {debug_opts} {pfm_opts} {gpu_opts} -I{src_dir}/../accelerator/gpu " f"-o {obj_files[i]} {file}" ) logger.debug("Compiling runtime: %s", cmd) @@ -207,7 +207,7 @@ def _compile_runtime_gpu_extension(out_lib: str, tdir: str): "perf_event_gpu.cpp", ] top_dir = Path(__file__).parents[2] - src_dir = top_dir / "csrcs" / "runtimes" / "gpu" + src_dir = top_dir / "csrcs" / "runtimes" / "accelerator" / "gpu" src_files = [f"{src_dir}/{file}" for file in files] # Compile @@ -215,7 +215,7 @@ def _compile_runtime_gpu_extension(out_lib: str, tdir: str): obj_file = f"{tdir}/perf_event_gpu.o" cmd = ( "c++ -c -O2 -march=native -fPIC " - f"-I{src_dir} {debug_opts} -I{src_dir}/../host " + f"-I{src_dir} {debug_opts} -I{src_dir}/../../host " f"-I{cuda_install_dir}/include " f"-o {obj_file} {' '.join(src_files)}" ) @@ -274,18 +274,3 @@ def resolve_runtime(runtime_type: RuntimeType): ) _runtime_entries[runtime_type.value] = entries return _runtime_entries[runtime_type.value] - - -# Host Runtime - - -def type() -> RuntimeType: - return RuntimeType.HOST - - -def __getattr__(x: str): - if x in runtime_funcs: - entries = resolve_runtime(RuntimeType.HOST) - assert entries is not None - return entries[x] - raise AttributeError(f"undefined runtime function: {x}") diff --git a/src/xtc/runtimes/types/ndarray.py b/src/xtc/runtimes/types/ndarray.py index 0148d807c..351f4ecd1 100644 --- a/src/xtc/runtimes/types/ndarray.py +++ 
b/src/xtc/runtimes/types/ndarray.py @@ -5,6 +5,7 @@ from typing import Any import ctypes import numpy as np +from enum import Enum __all__ = [ "NDArray", @@ -12,7 +13,15 @@ from .dlpack import DLDevice, DLDeviceTypeCode, DLDataType, DLDataTypeCode, CNDArray -import xtc.runtimes.host.runtime as runtime +from xtc.runtimes.host.HostRuntime import HostRuntime + +from xtc.itf.runtime.common import CommonRuntimeInterface +from xtc.itf.runtime.accelerator import AcceleratorDevice + + +class NDArrayLocation(Enum): + HOST = 0 + DEVICE = 1 class NDArray: @@ -30,19 +39,28 @@ class NDArray: } rev_np_dtype_map: dict[tuple[int, int], str] = {} - def __init__(self, array: Any) -> None: + def __init__( + self, array: Any, runtime: CommonRuntimeInterface | None = None + ) -> None: if not self.rev_np_dtype_map: self.rev_np_dtype_map.update( {v: k for k, v in NDArray.np_dtype_map.items()} ) self.handle = None + self.device_handle = None + self.runtime = runtime + if self.runtime is None: + self.runtime = HostRuntime() + self.location = NDArrayLocation.HOST if isinstance(array, NDArray): raise RuntimeError("TODO: copy from CNDArray not supported yet") elif isinstance(array, np.ndarray): self._from_numpy(array) else: assert 0 + if isinstance(self.runtime, AcceleratorDevice): + self._to_device() def _from_numpy(self, nparray: np.ndarray) -> None: assert nparray.flags["C_CONTIGUOUS"] @@ -64,11 +82,48 @@ def _copy_to_numpy(self, out: np.ndarray) -> np.ndarray: return out def numpy(self, out: np.ndarray | None = None) -> np.ndarray: + if self.is_on_device(): + assert isinstance(self.runtime, AcceleratorDevice) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.runtime.memory_copy_from( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) if out is None: return self._to_numpy() else: return self._copy_to_numpy(out) + def _to_device(self) -> None: + assert ( + isinstance(self.runtime, AcceleratorDevice) + and self.location == 
NDArrayLocation.HOST + ) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.device_handle = self.runtime.memory_allocate(bytes_size) + self.runtime.memory_copy_to( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) + self.location = NDArrayLocation.DEVICE + + def _from_device(self) -> None: + assert ( + isinstance(self.runtime, AcceleratorDevice) + and self.location == NDArrayLocation.DEVICE + ) + assert self.handle is not None + bytes_size = self.size * self.dtype.itemsize + self.runtime.memory_copy_from( + self.device_handle, self.handle.contents.dl_tensor.data, bytes_size + ) + self.runtime.memory_free(self.device_handle) + self.device_handle = None + self.location = NDArrayLocation.HOST + + def is_on_device(self) -> bool: + return self.location == NDArrayLocation.DEVICE + @property def dtype_str(self) -> str: assert self.handle is not None @@ -103,18 +158,21 @@ def size(self) -> int: @property def data(self) -> Any: assert self.handle is not None + if self.is_on_device(): + assert isinstance(self.runtime, AcceleratorDevice) + return self.runtime.memory_data_pointer(self.device_handle) return self.handle.contents.dl_tensor.data @classmethod def _copy_from(cls, handle: Any, data_handle: Any) -> None: - runtime.cndarray_copy_from_data( + HostRuntime.get().cndarray_copy_from_data( handle, data_handle, ) @classmethod def _copy_to(cls, handle: Any, data_handle: Any) -> None: - runtime.cndarray_copy_to_data( + HostRuntime.get().cndarray_copy_to_data( handle, data_handle, ) @@ -132,7 +190,7 @@ def _new( device = DLDevice(DLDeviceTypeCode.kDLCPU, 0) shape_array = (ctypes.c_int64 * len(shape))(*shape) dldtype = cls._dldatatype(np_dtype) - handle = runtime.cndarray_new( + handle = HostRuntime.get().cndarray_new( len(shape), ctypes.cast(shape_array, ctypes.POINTER(ctypes.c_int64)), dldtype, @@ -145,9 +203,14 @@ def _new( def __del__(self) -> None: if self.handle is not None: - runtime.cndarray_del(self.handle) + 
assert self.runtime is not None + self.runtime.cndarray_del(self.handle) self.handle = None + if self.device_handle is not None: + assert isinstance(self.runtime, AcceleratorDevice) + self.runtime.memory_free(self.device_handle) + self.device_handle = None @classmethod def set_alloc_alignment(cls, alignment: int) -> None: - runtime.cndarray_set_alloc_alignment(alignment) + HostRuntime.get().cndarray_set_alloc_alignment(alignment) diff --git a/src/xtc/targets/accelerator/gpu/GPUEvaluator.py b/src/xtc/targets/accelerator/gpu/GPUEvaluator.py new file mode 100644 index 000000000..7da994e54 --- /dev/null +++ b/src/xtc/targets/accelerator/gpu/GPUEvaluator.py @@ -0,0 +1,138 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +from xtc.runtimes.accelerator.gpu.GPUDevice import GPUDevice +import xtc.targets.accelerator.gpu as gpu +import xtc.itf as itf +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) + +__all__ = [ + "GPUEvaluator", + "GPUExecutor", +] + + +class GPUEvaluator(itf.exec.Evaluator): + def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: + self._device = GPUDevice() + self._module = module + self._repeat = kwargs.get("repeat", 1) + self._min_repeat_ms = kwargs.get("min_repeat_ms", 100) + self._number = kwargs.get("number", 1) + self._validate = kwargs.get("validate", False) + self._parameters = kwargs.get("parameters") + self._init_zero = kwargs.get("init_zero", False) + self._np_inputs_spec = kwargs.get( + "np_inputs_spec", self._module._np_inputs_spec + ) + self._np_outputs_spec = kwargs.get( + "np_outputs_spec", self._module._np_outputs_spec + ) + self._reference_impl = kwargs.get( + "reference_impl", self._module._reference_impl + ) + self._pmu_counters = kwargs.get("pmu_counters", []) + + assert self._module.file_type == "shlib", "only support shlib 
for evaluation" + + @override + def evaluate(self) -> tuple[list[float], int, str]: + assert self._module._bare_ptr, "bare_ptr is not supported for evaluation" + + # Initialize the device and load the module + self._device.init_device() + self._device.load_module(self._module) + sym = self._module.payload_name + func = self._device.get_module_function(self._module, sym) + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, + ) + + # Map the buffers + # TODO Replace memory mapping of buffers by explicit transfers + for i, buffer in enumerate(parameters[0]): + if self._np_inputs_spec()[i]["device"] is None: + self._device._register_buffer( + buffer.data, buffer.size * buffer.dtype.itemsize + ) + for i, buffer in enumerate(parameters[1]): + if self._np_outputs_spec()[i]["device"] is None: + self._device._register_buffer( + buffer.data, buffer.size * buffer.dtype.itemsize + ) + + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._device, + ) + + # Unmap the buffers + for i, buffer in enumerate(parameters[0]): + if self._np_inputs_spec()[i]["device"] is None: + self._device._unregister_buffer(buffer.data) + for i, buffer in enumerate(parameters[1]): + if self._np_outputs_spec()[i]["device"] is None: + self._device._unregister_buffer(buffer.data) + + # Unload the module + self._device.unload_module(self._module) + + # Copy out outputs + if self._parameters is not None: + copy_outputs(parameters, self._parameters) + + return results + + @property + @override + def module(self) -> 
itf.comp.Module: + return self._module + + +class GPUExecutor(itf.exec.Executor): + def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: + self._evaluator = GPUEvaluator( + module=module, + repeat=1, + min_repeat_ms=0, + number=1, + **kwargs, + ) + + @override + def execute(self) -> int: + results, code, err_msg = self._evaluator.evaluate() + return code + + @property + @override + def module(self) -> itf.comp.Module: + return self._evaluator.module diff --git a/src/xtc/targets/accelerator/gpu/GPUModule.py b/src/xtc/targets/accelerator/gpu/GPUModule.py new file mode 100644 index 000000000..9fd27f3c7 --- /dev/null +++ b/src/xtc/targets/accelerator/gpu/GPUModule.py @@ -0,0 +1,89 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +import xtc.itf as itf +from xtc.itf.graph import Graph +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) +from .GPUEvaluator import GPUExecutor, GPUEvaluator + + +__all__ = [ + "GPUModule", +] + + +class GPUModule(itf.comp.Module): + def __init__( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + graph: Graph | None = None, + **kwargs: Any, + ) -> None: + self._name = name + self._payload_name = payload_name + self._file_name = file_name + self._file_type = file_type + assert self._file_type == "shlib", "only support shlib for JIR Module" + lib_suffixes = ("so", "dylib") + assert self._file_name.endswith(lib_suffixes), ( + f"file name {self._file_name} is not a shlib" + ) + self._bare_ptr = kwargs.get("bare_ptr", True) + self._graph = graph + if self._graph is not None: + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) + else: + self._np_inputs_spec = kwargs.get("np_inputs_spec") + 
self._np_outputs_spec = kwargs.get("np_outputs_spec") + self._reference_impl = kwargs.get("reference_impl") + + @property + @override + def file_type(self) -> str: + return self._file_type + + @property + @override + def name(self) -> str: + return self._name + + @property + @override + def payload_name(self) -> str: + return self._payload_name + + @property + @override + def file_name(self) -> str: + return self._file_name + + @override + def export(self) -> None: + raise NotImplementedError("GPUModule.export is not implemented") + + @override + def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: + return GPUEvaluator( + self, + **kwargs, + ) + + @override + def get_executor(self, **kwargs: Any) -> itf.exec.Executor: + return GPUExecutor( + self, + **kwargs, + ) diff --git a/src/xtc/targets/gpu/__init__.py b/src/xtc/targets/accelerator/gpu/__init__.py similarity index 100% rename from src/xtc/targets/gpu/__init__.py rename to src/xtc/targets/accelerator/gpu/__init__.py diff --git a/src/xtc/targets/accelerator/mppa/MppaEvaluator.py b/src/xtc/targets/accelerator/mppa/MppaEvaluator.py new file mode 100644 index 000000000..9baf22042 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/MppaEvaluator.py @@ -0,0 +1,117 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any +from typing_extensions import override + +import xtc.targets.accelerator.mppa as mppa +import xtc.itf as itf +from xtc.runtimes.accelerator.mppa import MppaDevice +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) + +__all__ = [ + "MppaEvaluator", + "MppaExecutor", +] + + +class MppaEvaluator(itf.exec.Evaluator): + def __init__(self, module: "mppa.MppaModule", **kwargs: Any) -> None: + self._device = MppaDevice(module._mppa_config) + self._module = module + self._repeat = kwargs.get("repeat", 1) + self._min_repeat_ms = kwargs.get("min_repeat_ms", 
100) + self._number = kwargs.get("number", 1) + self._validate = kwargs.get("validate", False) + self._parameters = kwargs.get("parameters") + self._init_zero = kwargs.get("init_zero", False) + self._np_inputs_spec = kwargs.get( + "np_inputs_spec", self._module._np_inputs_spec + ) + self._np_outputs_spec = kwargs.get( + "np_outputs_spec", self._module._np_outputs_spec + ) + self._reference_impl = kwargs.get( + "reference_impl", self._module._reference_impl + ) + self._pmu_counters = kwargs.get("pmu_counters", []) + + assert self._module.file_type == "shlib", "only support shlib for evaluation" + + @override + def evaluate(self) -> tuple[list[float], int, str]: + assert self._module._bare_ptr, "bare_ptr is not supported for evaluation" + + # Initialize the device and load the module + self._device.init_device() + self._device.load_module(self._module) + sym = self._module.payload_name + func = self._device.get_module_function(self._module, sym) + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, + ) + + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._device, + ) + + # Unload the module + self._device.unload_module(self._module) + + # Copy out outputs + if self._parameters is not None: + copy_outputs(parameters, self._parameters) + + return results + + @property + @override + def module(self) -> itf.comp.Module: + return self._module + + +class MppaExecutor(itf.exec.Executor): + def __init__(self, module: "mppa.MppaModule", **kwargs: Any) -> None: + self._evaluator = 
MppaEvaluator( + module=module, + repeat=1, + min_repeat_ms=0, + number=1, + **kwargs, + ) + + @override + def execute(self) -> int: + results, code, err_msg = self._evaluator.evaluate() + return code + + @property + @override + def module(self) -> itf.comp.Module: + return self._evaluator.module diff --git a/src/xtc/targets/accelerator/mppa/MppaModule.py b/src/xtc/targets/accelerator/mppa/MppaModule.py new file mode 100644 index 000000000..746329e16 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/MppaModule.py @@ -0,0 +1,94 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any, cast +from typing_extensions import override + +from xtc.itf.graph import Graph +import xtc.itf as itf + +from .MppaEvaluator import MppaEvaluator, MppaExecutor +from xtc.runtimes.accelerator.mppa import MppaConfig + +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) + +__all__ = [ + "MppaModule", +] + + +class MppaModule(itf.comp.Module): + def __init__( + self, + name: str, + payload_name: str, + file_name: str, + file_type: str, + mppa_config: MppaConfig, + graph: Graph | None = None, + **kwargs: Any, + ) -> None: + self._name = name + self._payload_name = payload_name + self._file_name = file_name + self._file_type = file_type + assert self._file_type == "shlib", "only support shlib for JIR Module" + lib_suffixes = ("so", "dylib") + assert self._file_name.endswith(lib_suffixes), ( + f"file name {self._file_name} is not a shlib" + ) + self._bare_ptr = kwargs.get("bare_ptr", True) + self._graph = graph + if self._graph is not None: + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) + else: + self._np_inputs_spec = kwargs.get("np_inputs_spec") + self._np_outputs_spec = kwargs.get("np_outputs_spec") + self._reference_impl = 
kwargs.get("reference_impl") + self._mppa_config = mppa_config # FIXME: remove passing config to module + # TODO Handle shlib of multiple kernels on fly + + @property + @override + def file_type(self) -> str: + return self._file_type + + @property + @override + def name(self) -> str: + return self._name + + @property + @override + def payload_name(self) -> str: + return self._payload_name + + @property + @override + def file_name(self) -> str: + return self._file_name + + @override + def export(self) -> None: + raise NotImplementedError("AcceleratorModule.export is not implemented") + + @override + def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: + return MppaEvaluator( + self, + **kwargs, + ) + + @override + def get_executor(self, **kwargs: Any) -> itf.exec.Executor: + return MppaExecutor( + self, + **kwargs, + ) diff --git a/src/xtc/targets/accelerator/mppa/__init__.py b/src/xtc/targets/accelerator/mppa/__init__.py new file mode 100644 index 000000000..5ab2c51d8 --- /dev/null +++ b/src/xtc/targets/accelerator/mppa/__init__.py @@ -0,0 +1,6 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from .MppaModule import MppaModule +from .MppaEvaluator import MppaEvaluator, MppaExecutor diff --git a/src/xtc/targets/gpu/GPUEvaluator.py b/src/xtc/targets/gpu/GPUEvaluator.py deleted file mode 100644 index 1017d4063..000000000 --- a/src/xtc/targets/gpu/GPUEvaluator.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any -from typing_extensions import override -import numpy as np -import ctypes - -from xtc.runtimes.types.ndarray import NDArray -import xtc.runtimes.gpu.runtime as gpu_runtime -from xtc.utils.numpy import ( - np_init, -) -from xtc.utils.tools import get_mlir_prefix -from xtc.utils.loader import LibLoader -from xtc.utils.ext_tools import cuda_runtime_lib - -import xtc.itf as itf -import xtc.targets.gpu 
as gpu -from xtc.targets.host import HostEvaluator, HostExecutor - - -__all__ = [ - "GPUEvaluator", - "GPUExecutor", -] - - -class GPUEvaluator(HostEvaluator): - def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: - self._runtime_lib = LibLoader(f"{get_mlir_prefix()}/lib/{cuda_runtime_lib}") - kwargs["register_buffer_fn"] = self._register_buffer - kwargs["unregister_buffer_fn"] = self._unregister_buffer - kwargs["runtime"] = gpu_runtime - super().__init__(module, **kwargs) - - def __exit(self, exc_type, exc_value, traceback) -> None: - runtime_lib.close() - - def _register_buffer(self, buffer: NDArray) -> None: - nb_bytes = buffer.size * buffer.dtype.itemsize - nb_bytes_c = ctypes.c_int64(nb_bytes) - buffer_ptr = ctypes.cast(buffer.data, ctypes.c_void_p) - func_name = "mgpuMemHostRegister" - func = getattr(self._runtime_lib.lib, func_name) - assert func is not None, ( - f"Cannot find symbol {func_name} in lib {self._runtime_lib.lib}" - ) - func(buffer_ptr, nb_bytes_c) - - def _unregister_buffer(self, buffer: NDArray) -> None: - buffer_ptr = ctypes.cast(buffer.data, ctypes.c_void_p) - func_name = "mgpuMemHostUnregister" - func = getattr(self._runtime_lib.lib, func_name) - assert func is not None, ( - f"Cannot find symbol {func_name} in lib {self._runtime_lib.lib}" - ) - func(buffer_ptr) - - -class GPUExecutor(HostExecutor): - def __init__(self, module: "gpu.GPUModule", **kwargs: Any) -> None: - self._evaluator = GPUEvaluator( - module=module, - repeat=1, - min_repeat_ms=0, - number=1, - **kwargs, - ) diff --git a/src/xtc/targets/gpu/GPUModule.py b/src/xtc/targets/gpu/GPUModule.py deleted file mode 100644 index 8948db05a..000000000 --- a/src/xtc/targets/gpu/GPUModule.py +++ /dev/null @@ -1,47 +0,0 @@ -# -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024-2026 The XTC Project Authors -# -from typing import Any, cast -from typing_extensions import override - -import xtc.itf as itf -from xtc.itf.graph import Graph -from xtc.graphs.xtc.graph 
import XTCGraph -from xtc.graphs.xtc.data import XTCTensor -from xtc.graphs.xtc.expr import XTCTensorExpr - -from .GPUEvaluator import GPUExecutor, GPUEvaluator -from xtc.targets.host import HostModule - - -__all__ = [ - "GPUModule", -] - - -class GPUModule(HostModule): - def __init__( - self, - name: str, - payload_name: str, - file_name: str, - file_type: str, - graph: Graph | None = None, - **kwargs: Any, - ) -> None: - super().__init__(name, payload_name, file_name, file_type, graph, **kwargs) - - @override - def get_evaluator(self, **kwargs: Any) -> itf.exec.Evaluator: - return GPUEvaluator( - self, - **kwargs, - ) - - @override - def get_executor(self, **kwargs: Any) -> itf.exec.Executor: - return GPUExecutor( - self, - **kwargs, - ) diff --git a/src/xtc/targets/host/HostEvaluator.py b/src/xtc/targets/host/HostEvaluator.py index cae767d7a..c0f9b1af7 100644 --- a/src/xtc/targets/host/HostEvaluator.py +++ b/src/xtc/targets/host/HostEvaluator.py @@ -5,16 +5,25 @@ from typing import Any from typing_extensions import override import numpy as np +from pathlib import Path from xtc.runtimes.types.ndarray import NDArray from xtc.utils.numpy import ( np_init, ) -from xtc.runtimes.host.evaluate import load_and_evaluate + +from xtc.runtimes.host.HostRuntime import HostRuntime import xtc.itf as itf import xtc.targets.host as host +from xtc.utils.loader import LibLoader +from xtc.utils.evaluation import ( + ensure_ndarray_parameters, + validate_outputs, + evaluate_performance, + copy_outputs, +) __all__ = [ "HostEvaluator", @@ -40,78 +49,53 @@ def __init__(self, module: "host.HostModule", **kwargs: Any) -> None: self._reference_impl = kwargs.get( "reference_impl", self._module._reference_impl ) - self._register_buffer_fn = kwargs.get("register_buffer_fn", None) - self._unregister_buffer_fn = kwargs.get("unregister_buffer_fn", None) self._pmu_counters = kwargs.get("pmu_counters", []) - self._runtime = kwargs.get("runtime", None) - if self._runtime is None: - import 
xtc.runtimes.host.runtime as host_runtime - - self._runtime = host_runtime + self._runtime = kwargs.get("runtime", HostRuntime()) assert self._module.file_type == "shlib", "only support shlib for evaluation" @override def evaluate(self) -> tuple[list[float], int, str]: - if self._parameters is None: - assert self._np_inputs_spec is not None - assert self._np_outputs_spec is not None - inputs_spec = self._np_inputs_spec() - outputs_spec = self._np_outputs_spec() - out_init = np.zeros if self._init_zero else np.empty - inputs = [np_init(**spec) for spec in inputs_spec] - outputs = [out_init(**spec) for spec in outputs_spec] - parameters = ( - [NDArray(inp) for inp in inputs], - [NDArray(out) for out in outputs], - ) - else: - inputs, outputs = self._parameters - nd_inputs = [ - NDArray(inp) if isinstance(inp, np.ndarray) else inp for inp in inputs - ] - nd_outputs = [ - NDArray(out) if isinstance(out, np.ndarray) else out for out in outputs - ] - parameters = (nd_inputs, nd_outputs) - - ref_outputs = [] - if self._validate: - assert self._reference_impl is not None - ref_inputs = [inp.numpy() for inp in parameters[0]] - ref_outputs = [ - np.empty(shape=out.shape, dtype=out.dtype) for out in parameters[1] - ] - self._reference_impl(*ref_inputs, *ref_outputs) - - if self._register_buffer_fn is not None: - for buffer in parameters[0] + parameters[1]: - self._register_buffer_fn(buffer) - - results = load_and_evaluate( - runtime=self._runtime, - module_file=self._module.file_name, - module_name=self._module.name, - payload_name=self._module.payload_name, - bare_ptr=self._module._bare_ptr, - parameters=parameters, - validate=self._validate, - ref_outputs=ref_outputs, - repeat=self._repeat, - min_repeat_ms=self._min_repeat_ms, - number=self._number, - pmu_counters=self._pmu_counters, + # Load the module + dll = str(Path(self._module.file_name).absolute()) + lib = LibLoader(dll) + sym = self._module.payload_name + func = getattr(lib.lib, sym) + func.packed = not 
self._module._bare_ptr + results: tuple[list[float], int, str] = ([], 0, "") + validation_failed = False + + # Prepare the parameters + parameters = ensure_ndarray_parameters( + self._parameters, + self._np_inputs_spec, + self._np_outputs_spec, + self._init_zero, ) - if self._unregister_buffer_fn is not None: - for buffer in parameters[0] + parameters[1]: - self._unregister_buffer_fn(buffer) + # Check the correctness of the outputs + if self._validate: + results = validate_outputs(func, parameters, self._reference_impl) + validation_failed = results[1] != 0 + + # Measure the performance + if not validation_failed: + assert self._runtime is not None + results = evaluate_performance( + func, + parameters, + self._pmu_counters, + self._repeat, + self._number, + self._min_repeat_ms, + self._runtime, + ) + + # Unload the module + lib.close() + # Copy out outputs if self._parameters is not None: - _, outputs = self._parameters - _, outputs_copy = parameters - for out, out_copy in zip(outputs, outputs_copy): - if isinstance(out, np.ndarray): - out_copy.numpy(out=out) + copy_outputs(parameters, self._parameters) return results diff --git a/src/xtc/targets/host/HostModule.py b/src/xtc/targets/host/HostModule.py index a3b1cbd01..bf3c44870 100644 --- a/src/xtc/targets/host/HostModule.py +++ b/src/xtc/targets/host/HostModule.py @@ -5,13 +5,13 @@ from typing import Any, cast from typing_extensions import override -import sys - import xtc.itf as itf from xtc.itf.graph import Graph -from xtc.graphs.xtc.graph import XTCGraph -from xtc.graphs.xtc.data import XTCTensor -from xtc.graphs.xtc.expr import XTCTensorExpr +from xtc.utils.evaluation import ( + graph_np_inputs_spec, + graph_np_outputs_spec, + graph_reference_impl, +) from .HostEvaluator import HostExecutor, HostEvaluator @@ -40,59 +40,17 @@ def __init__( assert self._file_name.endswith(lib_suffixes), ( f"file name {self._file_name} is not a shlib" ) - self._bare_ptr = kwargs.get("bare_ptr", True) self._graph = graph if 
self._graph is not None: - self._np_inputs_spec = self._graph_np_inputs_spec - self._np_outputs_spec = self._graph_np_outputs_spec - self._reference_impl = self._graph_reference_impl + self._np_inputs_spec = graph_np_inputs_spec(self._graph) + self._np_outputs_spec = graph_np_outputs_spec(self._graph) + self._reference_impl = graph_reference_impl(self._graph) else: self._np_inputs_spec = kwargs.get("np_inputs_spec") self._np_outputs_spec = kwargs.get("np_outputs_spec") self._reference_impl = kwargs.get("reference_impl") - def _graph_np_inputs_spec(self) -> list[dict[str, Any]]: - assert isinstance(self._graph, XTCGraph) - assert all( - [ - isinstance(node._expr, XTCTensorExpr) and node._expr.type.is_constant() - for node in self._graph.inputs_nodes - ] - ), f"graph inputs are not tensors" - inputs_types = [ - cast(XTCTensorExpr, node._expr).type for node in self._graph.inputs_nodes - ] - return [ - { - "shape": type.constant_shape, - "dtype": type.constant_dtype, - } - for type in inputs_types - ] - - def _graph_np_outputs_spec(self) -> list[dict[str, Any]]: - assert isinstance(self._graph, XTCGraph) - assert all( - [node._outputs_types is not None for node in self._graph.outputs_nodes] - ), f"graph types were not forwarded" - return [ - { - "shape": type.constant_shape, - "dtype": type.constant_dtype, - } - for type in [ - cast(list, node._outputs_types)[0] for node in self._graph.outputs_nodes - ] - ] - - def _graph_reference_impl(self, *args: Any) -> None: - assert self._graph is not None - inputs = [XTCTensor(inp) for inp in args[: len(self._graph.inputs)]] - outputs = self._graph.forward(inputs) - for idx, out in enumerate(args[len(self._graph.inputs) :]): - out[:] = outputs[idx].numpy() - @property @override def file_type(self) -> str: diff --git a/src/xtc/utils/cfunc.py b/src/xtc/utils/cfunc.py new file mode 100644 index 000000000..bc5ee41be --- /dev/null +++ b/src/xtc/utils/cfunc.py @@ -0,0 +1,121 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 
(c) 2024-2026 The XTC Project Authors +# +from typing import Any +import ctypes + +__all__ = [ + "CFunc", + "CArgValue", + "CArgCode", + "CRetValue", + "CPackedFunc", + "_c_ascii_str", + "_str_list_to_c", +] + + +class ArgTypeCode: + INT = 0 + HANDLE = 3 + NDARRAY_HANDLE = 13 + + +CArgCode = ctypes.c_int + + +class CArgValue(ctypes.Union): + _fields_ = [ + ("v_int64", ctypes.c_int64), + ("v_float64", ctypes.c_double), + ("v_handle", ctypes.c_void_p), + ("v_str", ctypes.c_char_p), + ] + + +class CRetValue(CArgValue): + pass + + +CPackedFunc = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.POINTER(CArgValue), + ctypes.POINTER(CArgCode), + ctypes.c_int, + ctypes.POINTER(CRetValue), + ctypes.POINTER(CArgCode), +) + + +class CFunc: + def __init__(self, f: Any, packed: bool = False) -> None: + self.handle = f + self.is_packed = packed or ( + hasattr(self.handle, "packed") and self.handle.packed + ) + + def arg_tuple(self, arg: Any) -> Any: + if arg.__class__.__name__ == "ndarray": # Numpy Array + assert not self.is_packed + return (arg.ctypes.data_as(ctypes.c_voidp), ArgTypeCode.HANDLE) + elif arg.__class__.__name__ == "NDArray": # TVM NDArray or our NDArray + if ( + hasattr(arg, "is_on_device") and arg.is_on_device() + ): # Device living NDArray + if self.is_packed: + raise RuntimeError("TODO: device NDArray not supported yet") + else: + return ( + ctypes.cast(arg.data, ctypes.c_void_p), + ArgTypeCode.HANDLE, + ) + if self.is_packed: + return ( + CArgValue(v_handle=ctypes.cast(arg.handle, ctypes.c_void_p)), + ArgTypeCode.NDARRAY_HANDLE, + ) + else: + return ( + ctypes.cast(arg.handle.contents.dl_tensor.data, ctypes.c_void_p), + ArgTypeCode.HANDLE, + ) + else: + assert 0, f"Unsupported argument class: {arg.__class__.__name__}" + + def args_tuples(self, args: Any) -> list[Any]: + return [self.arg_tuple(arg) for arg in args] + + def __call__(self, *args: Any): + args_tuples = self.args_tuples(args) + if self.is_packed: + args_array = (CArgValue * len(args_tuples))( + *[arg[0] 
for arg in args_tuples] + ) + args_codes = (CArgCode * len(args_tuples))(*[arg[1] for arg in args_tuples]) + result_val = CRetValue(0) + result_code = CArgCode(ArgTypeCode.INT) + res = CPackedFunc(self.handle)( + args_array, + args_codes, + len(args_tuples), + ctypes.byref(result_val), + ctypes.byref(result_code), + ctypes.c_int(len(args_tuples)), + ) + assert res == 0, f"error calling packed function" + else: + data_args = [arg[0] for arg in args_tuples] + self.handle(*data_args) + + +class _c_ascii_str: + @staticmethod + def from_param(obj: str | bytes): + if isinstance(obj, str): + obj = obj.encode("ascii") + return ctypes.c_char_p.from_param(obj) + + +def _str_list_to_c(str_list: list[str]) -> Any: + return (ctypes.c_char_p * len(str_list))(*[str.encode("utf-8") for str in str_list]) diff --git a/src/xtc/utils/evaluation.py b/src/xtc/utils/evaluation.py new file mode 100644 index 000000000..e5deba670 --- /dev/null +++ b/src/xtc/utils/evaluation.py @@ -0,0 +1,194 @@ +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024-2026 The XTC Project Authors +# +from typing import Any, Callable, cast +from xtc.itf.graph import Graph +import ctypes +import numpy as np +from xtc.utils.numpy import np_init +from xtc.runtimes.types.ndarray import NDArray +from xtc.graphs.xtc.graph import XTCGraph +from xtc.graphs.xtc.expr import XTCTensorExpr +from xtc.graphs.xtc.data import XTCTensor +from xtc.utils.cfunc import CFunc, CArgValue, CArgCode +from xtc.itf.runtime.common import CommonRuntimeInterface +from xtc.runtimes.host.HostRuntime import HostRuntime + +__all__ = [] + + +def graph_np_inputs_spec(graph: Graph) -> Callable[[], list[dict[str, Any]]]: + assert isinstance(graph, XTCGraph) + assert all( + [ + isinstance(node._expr, XTCTensorExpr) and node._expr.type.is_constant() + for node in graph.inputs_nodes + ] + ), f"graph inputs are not tensors" + + def _graph_np_inputs_spec() -> list[dict[str, Any]]: + inputs_types = [ + cast(XTCTensorExpr, node._expr).type for 
node in graph.inputs_nodes + ] + return [ + { + "shape": type.constant_shape, + "dtype": type.constant_dtype, + "device": type.device, + } + for type in inputs_types + ] + + return _graph_np_inputs_spec + + +def graph_np_outputs_spec(graph: Graph) -> Callable[[], list[dict[str, Any]]]: + assert isinstance(graph, XTCGraph) + assert all([node._outputs_types is not None for node in graph.outputs_nodes]), ( + f"graph types were not forwarded" + ) + + def _graph_np_outputs_spec() -> list[dict[str, Any]]: + return [ + { + "shape": type.constant_shape, + "dtype": type.constant_dtype, + "device": type.device, + } + for type in [ + cast(list, node._outputs_types)[0] for node in graph.outputs_nodes + ] + ] + + return _graph_np_outputs_spec + + +def graph_reference_impl(graph: Graph) -> Callable[[], None]: + def _graph_reference_impl(*args: Any) -> None: + inputs = [XTCTensor(inp) for inp in args[: len(graph.inputs)]] + outputs = graph.forward(inputs) + for idx, out in enumerate(args[len(graph.inputs) :]): + out[:] = outputs[idx].numpy() + + return _graph_reference_impl + + +def ensure_ndarray_parameters( + parameters: tuple[Any, Any] | None, + np_inputs_spec: Callable[[], list[dict[str, Any]]] | None, + np_outputs_spec: Callable[[], list[dict[str, Any]]] | None, + init_zero: bool = False, +) -> tuple[list[NDArray], list[NDArray]]: + if parameters is None: + assert np_inputs_spec is not None + assert np_outputs_spec is not None + inputs_spec = np_inputs_spec() + outputs_spec = np_outputs_spec() + out_init = np.zeros if init_zero else np.empty + inputs = [ + (np_init(**spec), spec["device"] if "device" in spec else HostRuntime.get()) + for spec in inputs_spec + ] + outputs = [ + out_init(**{k: v for k, v in spec.items() if k != "device"}) + for spec in outputs_spec + ] + parameters = ( + [NDArray(*inp) for inp in inputs], + [ + NDArray( + out, + runtime=spec["device"] if "device" in spec else HostRuntime.get(), + ) + for out, spec in zip(outputs, outputs_spec) + ], + ) + else: 
+ inputs, outputs = parameters + nd_inputs = [ + NDArray(inp) if isinstance(inp, np.ndarray) else inp for inp in inputs + ] + nd_outputs = [ + NDArray(out) if isinstance(out, np.ndarray) else out for out in outputs + ] + parameters = (nd_inputs, nd_outputs) + return parameters + + +def validate_outputs( + func: Callable[[Any], Any], + parameters: tuple[list[NDArray], list[NDArray]], + reference_impl: Callable[[], None], +) -> tuple[list[float], int, str]: + # Get the reference outputs + assert reference_impl is not None + ref_inputs = [inp.numpy() for inp in parameters[0]] + ref_outputs = [np.empty(shape=out.shape, dtype=out.dtype) for out in parameters[1]] + reference_impl(*ref_inputs, *ref_outputs) + # Get the function outputs + CFunc(func)(*parameters[0], *parameters[1]) + # Compare + for out_ref, out in zip(ref_outputs, [out.numpy() for out in parameters[1]]): + if not np.allclose(out_ref, out): + return ([], 1, "Error in validation: outputs differ") + return ([], 0, "") + + +def evaluate_performance( + func: Callable[[Any], Any], + parameters: tuple[list[NDArray], list[NDArray]], + pmu_counters: list[str], + repeat: int, + number: int, + min_repeat_ms: int, + runtime: CommonRuntimeInterface | Any, +) -> tuple[list[float], int, str]: + # TODO migrate host runtime to CommonRuntimeInterface + cfunc = CFunc(func) + args_tuples = cfunc.args_tuples([*parameters[0], *parameters[1]]) + values_num = 1 + if len(pmu_counters) > 0: + values_num = len(pmu_counters) + # FIXME check if the PMU counters are supported by the target + results_array = (ctypes.c_double * (repeat * values_num))() + if cfunc.is_packed: + args_array_packed = (CArgValue * len(args_tuples))( + *[arg[0] for arg in args_tuples] + ) + args_codes_packed = (CArgCode * len(args_tuples))( + *[arg[1] for arg in args_tuples] + ) + runtime.evaluate_packed_perf( + results_array, + pmu_counters, + repeat, + number, + min_repeat_ms, + cfunc, + args_array_packed, + args_codes_packed, + len(args_tuples), + ) + 
eval_results = [float(x) for x in results_array] + else: + eval_results = runtime.evaluate_perf( + pmu_counters, + repeat, + number, + min_repeat_ms, + cfunc, + args_tuples, + ) + return (eval_results, 0, "") + + +def copy_outputs( + parameters: tuple[list[NDArray], list[NDArray]], + target_parameters: tuple[list[NDArray], list[NDArray]], +) -> None: + _, outputs = target_parameters + _, outputs_copy = parameters + for out, out_copy in zip(outputs, outputs_copy): + if isinstance(out, np.ndarray): + out_copy.numpy(out=out) diff --git a/src/xtc/utils/numpy.py b/src/xtc/utils/numpy.py index 500b6483d..5e85a978b 100644 --- a/src/xtc/utils/numpy.py +++ b/src/xtc/utils/numpy.py @@ -9,7 +9,7 @@ from .math import mulall -def np_init(shape: tuple, dtype: str) -> numpy.typing.NDArray[Any]: +def np_init(shape: tuple, dtype: str, **attrs: Any) -> numpy.typing.NDArray[Any]: """ Initialize and return a NP array filled with numbers in [1, 9]. diff --git a/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py b/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py new file mode 100644 index 000000000..c28c81342 --- /dev/null +++ b/tests/filecheck/backends/target_gpu/test_matmul_mlir_offload_tensor.py @@ -0,0 +1,161 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: mlir-target=nvgpu + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.gpu import GPUDevice + +# Create device +gpu = GPUDevice() + +I, J, K, dtype = 4, 32, 512, "float32" +a = O.tensor((I, K), dtype, name="A") # A lives on the host +b = O.tensor((K, J), dtype, name="B", device=gpu) # B lives on the accelerator + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C", device=gpu) # C lives on the accelerator + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.tile("i", {"i1": 2}) +sch.tile("j", {"j1": 16}) +sch.unroll({"i1": 2})
+sch.parallelize(["i"]) +sched = sch.schedule() + +comp = impl.get_compiler( + target=gpu, + shared_lib=True, + dump_file="gpu_matmul_mlir_offload_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x32xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : 
(!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_op, %forall_op = transform.structured.tile_using_forall %1 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %forall_op "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./k" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./i1" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./j1" : !transform.any_op +# CHECK-NEXT: transform.loop.unroll %loops_7 {factor = 2 : i64} : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)> +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x32xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = 
arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c1_1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_0 to %c32 step %c1_1 { +# CHECK-NEXT: %subview_2 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.forall (%arg3) in (2) { +# CHECK-NEXT: %0 = affine.apply #map(%arg3) +# CHECK-NEXT: %subview = memref.subview %arg0[%0, 0] [2, 512] [1, 1] : memref<4x512xf32> to memref<2x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>> +# CHECK-NEXT: %subview_1 = memref.subview %arg2[%0, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c32 = arith.constant 32 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c32 step %c16 { +# CHECK-NEXT: %subview_3 = memref.subview %subview[0, 0] [2, 512] [1, 1] : memref<2x512xf32, strided<[512, 1], offset: ?>> to memref<2x512xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_4 = memref.subview %subview_0[0, %arg4] [512, 16] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_5 = memref.subview %subview_1[0, %arg4] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# 
CHECK-NEXT: %c0_6 = arith.constant 0 : index +# CHECK-NEXT: %c512 = arith.constant 512 : index +# CHECK-NEXT: %c1_7 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_6 to %c512 step %c1_7 { +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg5] [2, 1] [1, 1] : memref<2x512xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_9 = memref.subview %subview_4[%arg5, 0] [1, 16] [1, 1] : memref<512x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_10 = memref.subview %subview_5[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_11 = arith.constant 0 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: %c1_12 = arith.constant 1 : index +# CHECK-NEXT: %c2_13 = arith.constant 2 : index +# CHECK-NEXT: %subview_14 = memref.subview %subview_8[%c0_11, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_15 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_16 = memref.subview %subview_10[%c0_11, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_17 = arith.constant 0 : index +# CHECK-NEXT: %c16_18 = arith.constant 16 : index +# CHECK-NEXT: %c1_19 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_17 to %c16_18 step %c1_19 { +# CHECK-NEXT: %subview_27 = memref.subview %subview_14[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_15[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 
1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_16[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j1"} +# CHECK-NEXT: %c1_20 = arith.constant 1 : index +# CHECK-NEXT: %1 = arith.muli %c1_12, %c1_20 : index +# CHECK-NEXT: %2 = arith.addi %c0_11, %1 : index +# CHECK-NEXT: %subview_21 = memref.subview %subview_8[%2, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_22 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_23 = memref.subview %subview_10[%2, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %c0_24 = arith.constant 0 : index +# CHECK-NEXT: %c16_25 = arith.constant 16 : index +# CHECK-NEXT: %c1_26 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_24 to %c16_25 step %c1_26 { +# CHECK-NEXT: %subview_27 = memref.subview %subview_21[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>> +# CHECK-NEXT: %subview_28 = memref.subview %subview_22[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: %subview_29 = memref.subview %subview_23[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, 
strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>) +# CHECK-NEXT: } {"./j1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x512xfloat32 +# CHECK-NEXT: - %1 : 512x32xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x32xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py new file mode 100644 index 000000000..665867b16 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_micro_kernel.py @@ -0,0 +1,244 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +I, J, K, dtype = 16, 16, 64, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 1, "psy": 1}) +sch.tile("i", {"i1": 8}) +sch.tile("j", {"j1": 8}) +sch.interchange(["i", "j", "i1", "j1", "k"]) +sch.vectorize(["i1", "j1", "k"]) +#sch.pack_at("i1", 1) +sched = sch.schedule() + +# Create mppa device +mppa = MppaDevice() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_mppa", + print_source_ir=True, + print_transformed_ir=True, + print_lowered_ir=True, 
+) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<16x16xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<16x64xf32>, memref<64x16xf32>) outs(%arg2 : memref<16x16xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=1, "psy"=1]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = 
transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 tile_sizes [8, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: transform.annotate %tiled_linalg_op_4 "xtc.request_vectorization" : !transform.any_op +# CHECK-NEXT: %4 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %4 { +# CHECK-NEXT: transform.apply_patterns.vector.reduction_to_contract +# CHECK-NEXT: transform.apply_patterns.vector.transfer_permutation_patterns +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %4 { +# CHECK-NEXT: transform.apply_patterns.vector.lower_outerproduct +# CHECK-NEXT: transform.apply_patterns.vector.lower_contraction +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=1, "psy"=1]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : 
index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %subview_0 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_0 : memref<1x1xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [8, 64] [1, 1] : memref<16x64xf32> to memref<8x64xf32, strided<[64, 1], offset: ?>> +# CHECK-NEXT: %subview_0 = memref.subview %arg1[0, 0] [64, 16] [1, 1] : memref<64x16xf32> to memref<64x16xf32, strided<[16, 1]>> +# CHECK-NEXT: %subview_1 = memref.subview %arg2[%arg3, 0] [8, 16] [1, 1] : memref<16x16xf32> to memref<8x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %subview_2 = memref.subview %subview_0[0, %arg4] [64, 8] [1, 1] : memref<64x16xf32, strided<[16, 1]>> to memref<64x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_3 = memref.subview %subview_1[0, %arg4] [8, 8] [1, 1] : memref<8x16xf32, strided<[16, 1], offset: ?>> to memref<8x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_, xtc.request_vectorization} ins(%subview, %subview_2 : memref<8x64xf32, strided<[64, 1], offset: ?>>, memref<64x8xf32, strided<[16, 1], offset: ?>>) outs(%subview_3 : memref<8x8xf32, strided<[16, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After MLIR Opt //----- // +# CHECK-NEXT: #map = affine_map<(d0, d1, d2) -> (d0, 
d2)> +# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +# CHECK-NEXT: "builtin.module"() ({ +# CHECK-NEXT: "func.func"() <{arg_attrs = [{llvm.noalias}, {llvm.noalias}, {llvm.noalias}], function_type = (memref<16x64xf32>, memref<64x16xf32>, memref<16x16xf32>) -> (), sym_name = "matmul"}> ({ +# CHECK-NEXT: ^bb0(%arg0: memref<16x64xf32>, %arg1: memref<64x16xf32>, %arg2: memref<16x16xf32>): +# CHECK-NEXT: "mppa.launch"() ({ +# CHECK-NEXT: "kvxcluster.launch"() ({ +# CHECK-NEXT: ^bb0(%arg3: index): +# CHECK-NEXT: %0 = "arith.constant"() <{value = 1 : index}> : () -> index +# CHECK-NEXT: %1 = "arith.constant"() <{value = 16 : index}> : () -> index +# CHECK-NEXT: %2 = "arith.constant"() <{value = 0 : index}> : () -> index +# CHECK-NEXT: %3 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32 +# CHECK-NEXT: %4 = "arith.constant"() <{value = 8 : index}> : () -> index +# CHECK-NEXT: "scf.for"(%2, %1, %0) ({ +# CHECK-NEXT: ^bb0(%arg9: index): +# CHECK-NEXT: %11 = "memref.subview"(%arg2, %arg9) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x16xf32>, index) -> memref<1x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "scf.for"(%2, %1, %0) ({ +# CHECK-NEXT: ^bb0(%arg10: index): +# CHECK-NEXT: %12 = "memref.subview"(%11, %arg10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x16xf32, strided<[16, 1], offset: ?>>, index) -> memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "linalg.fill"(%3, %12) <{operandSegmentSizes = array}> ({ +# CHECK-NEXT: ^bb0(%arg11: f32, %arg12: f32): +# CHECK-NEXT: "linalg.yield"(%arg11) : (f32) -> () +# CHECK-NEXT: }) {__xtc_id_C_0_} : (f32, memref<1x1xf32, strided<[16, 1], offset: ?>>) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./j"} : (index, index, index) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# 
CHECK-NEXT: }) {"./i"} : (index, index, index) -> () +# CHECK-NEXT: "scf.for"(%2, %1, %4) ({ +# CHECK-NEXT: ^bb0(%arg4: index): +# CHECK-NEXT: %5 = "memref.subview"(%arg0, %arg4) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x64xf32>, index) -> memref<8x64xf32, strided<[64, 1], offset: ?>> +# CHECK-NEXT: %6 = "memref.subview"(%arg2, %arg4) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<16x16xf32>, index) -> memref<8x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "scf.for"(%2, %1, %4) ({ +# CHECK-NEXT: ^bb0(%arg5: index): +# CHECK-NEXT: %7 = "memref.subview"(%arg1, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<64x16xf32>, index) -> memref<64x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %8 = "memref.subview"(%6, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<8x16xf32, strided<[16, 1], offset: ?>>, index) -> memref<8x8xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: "linalg.matmul"(%5, %7, %8) <{indexing_maps = [#map, #map1, #map2], operandSegmentSizes = array}> ({ +# CHECK-NEXT: ^bb0(%arg6: f32, %arg7: f32, %arg8: f32): +# CHECK-NEXT: %9 = "arith.mulf"(%arg6, %arg7) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 +# CHECK-NEXT: %10 = "arith.addf"(%arg8, %9) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 +# CHECK-NEXT: "linalg.yield"(%10) : (f32) -> () +# CHECK-NEXT: }) {__xtc_id_C_, xtc.request_vectorization} : (memref<8x64xf32, strided<[64, 1], offset: ?>>, memref<64x8xf32, strided<[16, 1], offset: ?>>, memref<8x8xf32, strided<[16, 1], offset: ?>>) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./j"} : (index, index, index) -> () +# CHECK-NEXT: "scf.yield"() : () -> () +# CHECK-NEXT: }) {"./i"} : (index, index, index) -> () +# CHECK-NEXT: 
"kvxcluster.launch_terminator"() : () -> () +# CHECK-NEXT: }) {mask = 1 : i32, nclusters = 1 : i32} : () -> () +# CHECK-NEXT: "kvxcluster.await_all"() : () -> () +# CHECK-NEXT: "mppa.yield"() : () -> () +# CHECK-NEXT: }) {device = 1 : i32} : () -> () +# CHECK-NEXT: "func.return"() : () -> () +# CHECK-NEXT: }) : () -> () +# CHECK-NEXT: }) {transform.with_named_sequence} : () -> () +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After MPPA Opt //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @kvxcluster_launch_0_kernel_cc_0(%arg0: memref<16x16xf32, 2>, %arg1: memref<16x64xf32, 2>, %arg2: memref<64x16xf32, 2>) attributes {kernel_for_cluster_id = 0 : index} { +# CHECK-NEXT: %c64 = arith.constant 64 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c1 { +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c1 { +# CHECK-NEXT: %0 = arith.muli %arg3, %c16 overflow : index +# CHECK-NEXT: %1 = arith.addi %0, %arg4 : index +# CHECK-NEXT: %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1, 1], strides: [16, 1] : memref<16x16xf32, 2> to memref<1x1xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch %arg5 (npes=1) { +# CHECK-NEXT: memref.store %cst, %reinterpret_cast[%c0, %c0] : memref<1x1xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch_terminator +# CHECK-NEXT: } +# CHECK-NEXT: kvxpe.await_all +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: scf.for %arg3 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %0 = arith.muli %arg3, %c64 overflow : index +# CHECK-NEXT: %reinterpret_cast = memref.reinterpret_cast %arg1 to offset: [%0], sizes: [8, 64], strides: [64, 1] : memref<16x64xf32, 2> to memref<8x64xf32, strided<[64, 1], 
offset: ?>, 2> +# CHECK-NEXT: scf.for %arg4 = %c0 to %c16 step %c8 { +# CHECK-NEXT: %reinterpret_cast_0 = memref.reinterpret_cast %arg2 to offset: [%arg4], sizes: [64, 8], strides: [16, 1] : memref<64x16xf32, 2> to memref<64x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: %1 = arith.muli %arg3, %c16 overflow : index +# CHECK-NEXT: %2 = arith.addi %1, %arg4 : index +# CHECK-NEXT: %reinterpret_cast_1 = memref.reinterpret_cast %arg0 to offset: [%2], sizes: [8, 8], strides: [16, 1] : memref<16x16xf32, 2> to memref<8x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch %arg5 (npes=1) { +# CHECK-NEXT: kvxuks.mma_8x8xf32 %reinterpret_cast, %reinterpret_cast_0 -> %reinterpret_cast_1 : memref<8x64xf32, strided<[64, 1], offset: ?>, 2>, memref<64x8xf32, strided<[16, 1], offset: ?>, 2>, memref<8x8xf32, strided<[16, 1], offset: ?>, 2> +# CHECK-NEXT: kvxpe.launch_terminator +# CHECK-NEXT: } +# CHECK-NEXT: kvxpe.await_all +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: func.func @matmul(%arg0: memref<16x64xf32> {llvm.noalias}, %arg1: memref<64x16xf32> {llvm.noalias}, %arg2: memref<16x16xf32> {llvm.noalias}) { +# CHECK-NEXT: mppa.launch(k300) { +# CHECK-NEXT: %0 = mppa.alloc : memref<16x16xf32, 2> +# CHECK-NEXT: mppa.copy %arg2, %0 : memref<16x16xf32> to memref<16x16xf32, 2> +# CHECK-NEXT: %1 = mppa.alloc : memref<16x64xf32, 2> +# CHECK-NEXT: mppa.copy %arg0, %1 : memref<16x64xf32> to memref<16x64xf32, 2> +# CHECK-NEXT: %2 = mppa.alloc : memref<64x16xf32, 2> +# CHECK-NEXT: mppa.copy %arg1, %2 : memref<64x16xf32> to memref<64x16xf32, 2> +# CHECK-NEXT: kvxcluster.launch (nclusters=1, mask=1) +# CHECK-NEXT: 0 -> @kvxcluster_launch_0_kernel_cc_0 +# CHECK-NEXT: with (%0, %1, %2) : memref<16x16xf32, 2>, memref<16x64xf32, 2>, memref<64x16xf32, 2> +# CHECK-NEXT: kvxcluster.await_all +# CHECK-NEXT: mppa.dealloc %2 : memref<64x16xf32, 2> +# CHECK-NEXT: mppa.dealloc %1 : memref<16x64xf32, 2> +# CHECK-NEXT: mppa.copy 
%0, %arg2 : memref<16x16xf32, 2> to memref<16x16xf32> +# CHECK-NEXT: mppa.dealloc %0 : memref<16x16xf32, 2> +# CHECK-NEXT: kvxcluster.await_all +# CHECK-NEXT: } +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 16x64xfloat32 +# CHECK-NEXT: - %1 : 64x16xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 16x16xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [16x64xfloat32, 64x16xfloat32] -> [16x16xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py new file mode 100644 index 000000000..f3115e667 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_mppa.py @@ -0,0 +1,149 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +I, J, K, dtype = 4, 8, 16, "float32" +a = O.tensor((I, K), dtype, name="A") +b = O.tensor((K, J), dtype, name="B") + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C") + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 2, "psy": 8}) +sch.tile("i", {"i1": 2}) +sch.pack_at("i1", 1) +sched = sch.schedule() + +# Create mppa device +mppa = MppaDevice() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_mppa", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform 
//----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias}, %arg2: memref<4x8xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x16xf32>, memref<16x8xf32>) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=2, "psy"=8]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 
tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %tiled_linalg_op_6 { +# CHECK-NEXT: transform.apply_patterns.memref.fold_memref_alias_ops +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %4 = transform.sdist.local_buffer_at %tiled_linalg_op_6 tensor 1 : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=2, "psy"=8]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias}, %arg2: memref<4x8xf32> {llvm.noalias}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# 
CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 8] [1, 1] : memref<4x8xf32> to memref<1x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c8 step %c1_3 { +# CHECK-NEXT: %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x8xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [2, 16] [1, 1] : memref<4x16xf32> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, 0] [16, 8] [1, 1] : memref<16x8xf32> to memref<16x8xf32, strided<[8, 1]>> +# CHECK-NEXT: %subview_3 = memref.subview %arg2[%arg3, 0] [2, 8] [1, 1] : memref<4x8xf32> to memref<2x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { +# CHECK-NEXT: %subview_6 = memref.subview %subview[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %arg4] [16, 1] [1, 1] : memref<16x8xf32, strided<[8, 1]>> to memref<16x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [2, 1] [1, 1] : memref<2x8xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_9 
= arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c16 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_6[0, %arg5] [2, 1] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %alloc = memref.alloc() : memref<1x1xf32, 2> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: sdist.read %subview_7[%arg5, %c0_14] to %alloc : memref<16x1xf32, strided<[8, 1], offset: ?>>, memref<1x1xf32, 2> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c2_16 = arith.constant 2 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_15 to %c2_16 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_11[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %alloc[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 2> to memref<1x1xf32, strided<[1, 1]>, 2> +# CHECK-NEXT: %subview_20 = memref.subview %subview_13[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_18, %subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[1, 1]>, 2>) outs(%subview_20 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./i1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# 
CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x16xfloat32 +# CHECK-NEXT: - %1 : 16x8xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x8xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x16xfloat32, 16x8xfloat32] -> [4x8xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py b/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py new file mode 100644 index 000000000..677702c41 --- /dev/null +++ b/tests/filecheck/backends/target_mppa/test_matmul_mlir_offload_tensor.py @@ -0,0 +1,149 @@ +# RUN: python %s 2>&1 | filecheck %s +# REQUIRES: module_mlir_mppa +# REQUIRES: mlir-target=mppa + +import xtc.graphs.xtc.op as O +from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend + +from xtc.runtimes.accelerator.mppa import MppaDevice + +# Create device +mppa = MppaDevice() + +I, J, K, dtype = 4, 8, 16, "float32" +a = O.tensor((I, K), dtype, name="A") # A lives on the host +b = O.tensor((K, J), dtype, name="B", device=mppa) # B lives on the accelerator + +with O.graph(name="matmul") as gb: + O.matmul(a, b, name="C", device=mppa) # C must live on the accelerator + +graph = gb.graph +print(graph) + +impl = Backend(graph) + +sch = impl.get_scheduler() +sch.define_memory_mesh(axes={"mx": 1, "my": 1}) +sch.define_processor_mesh(axes={"px": 1, "py": 1, "psx": 2, "psy": 8}) +sch.tile("i", {"i1": 2}) +sch.pack_at("i1", 1) +sched = sch.schedule() + +comp = impl.get_compiler( + target=mppa, + shared_lib=True, + dump_file="matmul_mlir_offload_tensor", + print_source_ir=True, + print_transformed_ir=True, +) +module = comp.compile(sched) +executor = module.get_executor(validate=True) +res = executor.execute() +print(f"CODE: {res}") +# CHECK: // -----// IR Dump Before transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# 
CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x8xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x16xf32>, memref<16x8xf32>) outs(%arg2 : memref<4x8xf32>) +# CHECK-NEXT: return +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) { +# CHECK-NEXT: transform.structured.vectorize %arg0 : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { +# CHECK-NEXT: %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=1, "my"=1]> : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=1, "py"=1, "psx"=2, "psy"=8]> from "memory_mesh" : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_1 "./j" : !transform.any_op +# CHECK-NEXT: %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 tile_sizes [2, 0, 0] : (!transform.any_op) 
-> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_3 "./i" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_5 "./j" : !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_7 "./k" : !transform.any_op +# CHECK-NEXT: transform.apply_patterns to %tiled_linalg_op_6 { +# CHECK-NEXT: transform.apply_patterns.memref.fold_memref_alias_ops +# CHECK-NEXT: } : !transform.any_op +# CHECK-NEXT: %4 = transform.sdist.local_buffer_at %tiled_linalg_op_6 tensor 1 : !transform.any_op -> !transform.any_op +# CHECK-NEXT: %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +# CHECK-NEXT: transform.annotate %loops_9 "./i1" : !transform.any_op +# CHECK-NEXT: transform.yield +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: // -----// IR Dump After transform //----- // +# CHECK-NEXT: module attributes {transform.with_named_sequence} { +# CHECK-NEXT: sdist.processor_mesh @processor_mesh from @memory_mesh = <["px"=1, "py"=1, "psx"=2, "psy"=8]> +# CHECK-NEXT: sdist.memory_mesh @memory_mesh = <["mx"=1, "my"=1]> +# CHECK-NEXT: func.func @matmul(%arg0: memref<4x16xf32> {llvm.noalias}, %arg1: memref<16x8xf32> {llvm.noalias, memref.on_device}, %arg2: memref<4x8xf32> {llvm.noalias, memref.on_device}) { +# CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 +# CHECK-NEXT: %c0 = arith.constant 0 : index +# CHECK-NEXT: %c4 = arith.constant 4 : index +# CHECK-NEXT: %c1 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg3 = %c0 to %c4 step %c1 { +# 
CHECK-NEXT: %subview = memref.subview %arg2[%arg3, 0] [1, 8] [1, 1] : memref<4x8xf32> to memref<1x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_2 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_3 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_2 to %c8 step %c1_3 { +# CHECK-NEXT: %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x8xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: %c0_0 = arith.constant 0 : index +# CHECK-NEXT: %c4_1 = arith.constant 4 : index +# CHECK-NEXT: %c2 = arith.constant 2 : index +# CHECK-NEXT: scf.for %arg3 = %c0_0 to %c4_1 step %c2 { +# CHECK-NEXT: %subview = memref.subview %arg0[%arg3, 0] [2, 16] [1, 1] : memref<4x16xf32> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_2 = memref.subview %arg1[0, 0] [16, 8] [1, 1] : memref<16x8xf32> to memref<16x8xf32, strided<[8, 1]>> +# CHECK-NEXT: %subview_3 = memref.subview %arg2[%arg3, 0] [2, 8] [1, 1] : memref<4x8xf32> to memref<2x8xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_4 = arith.constant 0 : index +# CHECK-NEXT: %c8 = arith.constant 8 : index +# CHECK-NEXT: %c1_5 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg4 = %c0_4 to %c8 step %c1_5 { +# CHECK-NEXT: %subview_6 = memref.subview %subview[0, 0] [2, 16] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x16xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_7 = memref.subview %subview_2[0, %arg4] [16, 1] [1, 1] : memref<16x8xf32, strided<[8, 1]>> to memref<16x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_8 = memref.subview %subview_3[0, %arg4] [2, 1] [1, 1] : memref<2x8xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %c0_9 
= arith.constant 0 : index +# CHECK-NEXT: %c16 = arith.constant 16 : index +# CHECK-NEXT: %c1_10 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg5 = %c0_9 to %c16 step %c1_10 { +# CHECK-NEXT: %subview_11 = memref.subview %subview_6[0, %arg5] [2, 1] [1, 1] : memref<2x16xf32, strided<[16, 1], offset: ?>> to memref<2x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_12 = memref.subview %subview_7[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %subview_13 = memref.subview %subview_8[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<2x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: %alloc = memref.alloc() : memref<1x1xf32, 2> +# CHECK-NEXT: %c0_14 = arith.constant 0 : index +# CHECK-NEXT: sdist.read %subview_7[%arg5, %c0_14] to %alloc : memref<16x1xf32, strided<[8, 1], offset: ?>>, memref<1x1xf32, 2> +# CHECK-NEXT: %c0_15 = arith.constant 0 : index +# CHECK-NEXT: %c2_16 = arith.constant 2 : index +# CHECK-NEXT: %c1_17 = arith.constant 1 : index +# CHECK-NEXT: scf.for %arg6 = %c0_15 to %c2_16 step %c1_17 { +# CHECK-NEXT: %subview_18 = memref.subview %subview_11[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>> +# CHECK-NEXT: %subview_19 = memref.subview %alloc[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 2> to memref<1x1xf32, strided<[1, 1]>, 2> +# CHECK-NEXT: %subview_20 = memref.subview %subview_13[%arg6, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[8, 1], offset: ?>> to memref<1x1xf32, strided<[8, 1], offset: ?>> +# CHECK-NEXT: linalg.matmul {__xtc_id_C_} ins(%subview_18, %subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[1, 1]>, 2>) outs(%subview_20 : memref<1x1xf32, strided<[8, 1], offset: ?>>) +# CHECK-NEXT: } {"./i1"} +# CHECK-NEXT: } {"./k"} +# CHECK-NEXT: } {"./j"} +# CHECK-NEXT: } {"./i"} +# CHECK-NEXT: return +# CHECK-NEXT: } +# 
CHECK-NEXT: } +# CHECK-NEXT: +# CHECK-NEXT: graph: +# CHECK-NEXT: name: matmul +# CHECK-NEXT: inputs: +# CHECK-NEXT: - %0 : 4x16xfloat32 +# CHECK-NEXT: - %1 : 16x8xfloat32 +# CHECK-NEXT: outputs: +# CHECK-NEXT: - %2 : 4x8xfloat32 +# CHECK-NEXT: nodes: +# CHECK-NEXT: - %2: matmul(%0, %1) {name = 'C'} : [4x16xfloat32, 16x8xfloat32] -> [4x8xfloat32] +# CHECK-NEXT: +# CHECK-NEXT: CODE: 0 diff --git a/tests/filecheck/lit.cfg b/tests/filecheck/lit.cfg index 2743d0846..2b276d2c1 100644 --- a/tests/filecheck/lit.cfg +++ b/tests/filecheck/lit.cfg @@ -12,7 +12,7 @@ config.environment["TIMEOUT"] = "10" config.available_features = set() -for module in ["mlir", "tvm", "jir", "mlir_sdist"]: +for module in ["mlir", "tvm", "jir", "mlir_sdist", "mlir_mppa"]: if importlib.util.find_spec(module) is not None: config.available_features.add(f"module_{module}") else: @@ -28,6 +28,7 @@ env_passthrough = [ "PYTHONPATH", "XTC_MLIR_TARGET", "XTC_MLIR_PREFIX", + "KALRAY_TOOLCHAIN_DIR" ] config.environment.update({ var: os.environ[var]