llvm · tkarna · Nov 18, 2025
diff --git a/python/examples/workload/example.py b/python/examples/workload/example.py
@@ -0,0 +1,181 @@
+"""
+Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.
+"""
+import numpy as np
+from mlir import ir
+from mlir.runtime.np_to_memref import get_ranked_memref_descriptor
+from mlir.dialects import func, linalg, bufferization
+from mlir.dialects import transform
+from functools import cached_property
+from lighthouse import Workload
+from lighthouse.utils.mlir import (
+    apply_registered_pass,
+    canonicalize,
+    cse,
+    match,
+)
+from lighthouse.utils.execution import (
+    lower_payload,
+    execute,
+    benchmark,
+)
+
+
+class ElementwiseSum(Workload):
+    """
+    Computes element-wise sum of (M, N) float32 arrays on CPU.
+
+    We can construct the input arrays and compute the reference solution in
+    Python with Numpy.
+
+    We use @cached_property to store the inputs and reference solution in the
+    object so that they are only computed once.
+    """
+
+    def __init__(self, M, N):
+        self.M = M
+        self.N = N
+        self.dtype = np.float32
+        self.context = ir.Context()
+        self.location = ir.Location.unknown(context=self.context)
+
+    @cached_property
+    def _input_arrays(self):
+        print(" * Generating input arrays...")
+        np.random.seed(2)
+        A = np.random.rand(self.M, self.N).astype(self.dtype)
+        B = np.random.rand(self.M, self.N).astype(self.dtype)
+        C = np.zeros((self.M, self.N), dtype=self.dtype)
+        return [A, B, C]
+
+    @cached_property
+    def _reference_solution(self):
+        print(" * Computing reference solution...")
+        A, B, _ = self._input_arrays
+        return A + B
+
+    def get_input_arrays(self, execution_engine):
+        return [
+            get_ranked_memref_descriptor(a) for a in self._input_arrays
+        ]
+
+    def verify(self, execution_engine, verbose: int = 0) -> bool:
+        C = self._input_arrays[2]
+        C_ref = self._reference_solution
+        if verbose > 1:
+            print("Reference solution:")
+            print(C_ref)
+            print("Computed solution:")
+            print(C)
+        success = np.allclose(C, C_ref)
+        if verbose:
+            if success:
+                print("PASSED")
+            else:
+                print("FAILED Result mismatch!")
+        return success
+
+    def requirements(self):
+        return []
+
+    def get_complexity(self):
+        nbytes = np.dtype(self.dtype).itemsize
+        flop_count = self.M * self.N  # one addition per element
+        memory_reads = 2 * self.M * self.N * nbytes  # read A and B
+        memory_writes = self.M * self.N * nbytes  # write C
+        return (flop_count, memory_reads, memory_writes)
+
+    def payload_module(self):
+        with self.context, self.location:
+            float32_t = ir.F32Type.get()
+            shape = (self.M, self.N)
+            tensor_t = ir.RankedTensorType.get(shape, float32_t)
+            memref_t = ir.MemRefType.get(shape, float32_t)
+            mod = ir.Module.create()
+            with ir.InsertionPoint(mod.body):
+                args = [memref_t, memref_t, memref_t]
+                f = func.FuncOp(self.payload_function_name, (tuple(args), ()))
+                f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+            with ir.InsertionPoint(f.add_entry_block()):
+                A = f.arguments[0]
+                B = f.arguments[1]
+                C = f.arguments[2]
+                a_tensor = bufferization.ToTensorOp(tensor_t, A, restrict=True)
+                b_tensor = bufferization.ToTensorOp(tensor_t, B, restrict=True)
+                c_tensor = bufferization.ToTensorOp(
+                    tensor_t, C, restrict=True, writable=True
+                )
+                add = linalg.add(a_tensor, b_tensor, outs=[c_tensor])
+                bufferization.MaterializeInDestinationOp(
+                    None, add, C, restrict=True, writable=True
+                )
+                func.ReturnOp(())
+        return mod
+
+    def schedule_module(self, dump_kernel=None, parameters=None):
+        with self.context, self.location:
+            schedule_module = ir.Module.create()
+            schedule_module.operation.attributes[
+                "transform.with_named_sequence"] = (ir.UnitAttr.get())
+            with ir.InsertionPoint(schedule_module.body):
+                named_sequence = transform.NamedSequenceOp(
+                    "__transform_main",
+                    [transform.AnyOpType.get()],
+                    [],
+                    arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}],
+                )
+                with ir.InsertionPoint(named_sequence.body):
+                    anytype = transform.AnyOpType.get()
+                    func = match(named_sequence.bodyTarget, ops={"func.func"})
+                    mod = transform.get_parent_op(
+                        anytype,
+                        func,
+                        op_name="builtin.module",
+                        deduplicate=True,
+                    )
+                    mod = apply_registered_pass(mod, "one-shot-bufferize")
+                    mod = apply_registered_pass(mod, "convert-linalg-to-loops")
+                    cse(mod)
+                    canonicalize(mod)
+
+                    if dump_kernel == "bufferized":
+                        transform.YieldOp()
+                        return schedule_module
+
+                    mod = apply_registered_pass(mod, "convert-scf-to-cf")
+                    mod = apply_registered_pass(mod, "finalize-memref-to-llvm")
+                    mod = apply_registered_pass(mod, "convert-cf-to-llvm")
+                    mod = apply_registered_pass(mod, "convert-arith-to-llvm")
+                    mod = apply_registered_pass(mod, "convert-func-to-llvm")
+                    mod = apply_registered_pass(mod,
+                                                "reconcile-unrealized-casts")
+                    transform.YieldOp()
+
+        return schedule_module
+
+
+if __name__ == "__main__":
+    wload = ElementwiseSum(400, 400)
+
+    print(" Dump kernel ".center(60, "-"))
+    lower_payload(wload, dump_kernel="bufferized", dump_schedule=True)
+
+    print(" Execute 1 ".center(60, "-"))
+    execute(wload, verbose=2)
+
+    print(" Execute 2 ".center(60, "-"))
+    execute(wload, verbose=1)
+
+    print(" Benchmark ".center(60, "-"))
+    times = benchmark(wload)
+    times *= 1e6  # convert to microseconds
+    # compute statistics
+    mean = np.mean(times)
+    min = np.min(times)
+    max = np.max(times)
+    std = np.std(times)
+    print(f"Timings (us): "
+          f"mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}")
+    flop_count = wload.get_complexity()[0]
+    gflops = flop_count / (mean * 1e-6) / 1e9
+    print(f"Throughput: {gflops:.2f} GFLOPS")
diff --git a/python/examples/workload/example_mlir.py b/python/examples/workload/example_mlir.py
@@ -0,0 +1,217 @@
+"""
+Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.
+
+In this example, allocation and deallocation of input arrays are done in MLIR.
+"""
+import numpy as np
+from mlir import ir
+from mlir.runtime.np_to_memref import (
+    ranked_memref_to_numpy,
+    make_nd_memref_descriptor,
+    as_ctype,
+)
+from mlir.dialects import func, linalg, bufferization, arith, memref
+from mlir.dialects import transform
+import ctypes
+from contextlib import contextmanager
+from lighthouse import Workload
+from lighthouse.utils.mlir import (
+    apply_registered_pass,
+    canonicalize,
+    cse,
+    match,
+)
+from lighthouse.utils import get_packed_arg
+from lighthouse.utils.execution import (
+    lower_payload,
+    execute,
+    benchmark,
+)
+from example import ElementwiseSum
+
+
+def emit_host_alloc(mod, suffix, element_type, rank=2):
+    dyn = ir.ShapedType.get_dynamic_size()
+    memref_dyn_t = ir.MemRefType.get(rank*(dyn,), element_type)
+    index_t = ir.IndexType.get()
+    i32_t = ir.IntegerType.get_signless(32)
+    with ir.InsertionPoint(mod.body):
+        f = func.FuncOp(
+            "host_alloc_" + suffix, (rank*(i32_t,), (memref_dyn_t,))
+        )
+        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+    with ir.InsertionPoint(f.add_entry_block()):
+        dims = [
+            arith.IndexCastOp(index_t, a) for a in list(f.arguments)
+        ]
+        alloc = memref.alloc(memref_dyn_t, dims, [])
+        func.ReturnOp((alloc,))
+
+
+def emit_host_dealloc(mod, suffix, element_type, rank=2):
+    dyn = ir.ShapedType.get_dynamic_size()
+    memref_dyn_t = ir.MemRefType.get(rank*(dyn,), element_type)
+    with ir.InsertionPoint(mod.body):
+        f = func.FuncOp("host_dealloc_" + suffix, ((memref_dyn_t,), ()))
+        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+    with ir.InsertionPoint(f.add_entry_block()):
+        memref.dealloc(f.arguments[0])
+        func.ReturnOp(())
+
+
+def emit_fill_constant(mod, suffix, value, element_type, rank=2):
+    dyn = ir.ShapedType.get_dynamic_size()
+    memref_dyn_t = ir.MemRefType.get(rank*(dyn,), element_type)
+    with ir.InsertionPoint(mod.body):
+        f = func.FuncOp("host_fill_constant_" + suffix, ((memref_dyn_t,), ()))
+        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+    with ir.InsertionPoint(f.add_entry_block()):
+        const = arith.constant(element_type, value)
+        linalg.fill(const, outs=[f.arguments[0]])
+        func.ReturnOp(())
+
+
+def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2):
+    rank = 2
+    dyn = ir.ShapedType.get_dynamic_size()
+    memref_dyn_t = ir.MemRefType.get(rank*(dyn,), element_type)
+    i32_t = ir.IntegerType.get_signless(32)
+    f64_t = ir.F64Type.get()
+    with ir.InsertionPoint(mod.body):
+        f = func.FuncOp("host_fill_random_" + suffix, ((memref_dyn_t,), ()))
+        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+    with ir.InsertionPoint(f.add_entry_block()):
+        min_cst = arith.constant(f64_t, min)
+        max_cst = arith.constant(f64_t, max)
+        seed_cst = arith.constant(i32_t, seed)
+        linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[f.arguments[0]])
+        func.ReturnOp(())
+
+
+class ElementwiseSumMLIRAlloc(ElementwiseSum):
+    """
+    Computes element-wise sum of (M, N) float32 arrays on CPU.
+
+    Extends ElementwiseSum by allocating input arrays in MLIR.
+    """
+
+    def __init__(self, M, N):
+        super().__init__(M, N)
+        # keep track of allocated memrefs
+        self.memrefs = {}
+
+    def _allocate_array(self, name, execution_engine):
+        if name in self.memrefs:
+            return self.memrefs[name]
+        alloc_func = execution_engine.lookup("host_alloc_f32")
+        shape = (self.M, self.N)
+        mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))()
+        ptr_mref = ctypes.pointer(ctypes.pointer(mref))
+        ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape]
+        alloc_func(get_packed_arg([ptr_mref, *ptr_dims]))
+        self.memrefs[name] = mref
+        return mref
+
+    def _allocate_inputs(self, execution_engine):
+        self._allocate_array("A", execution_engine)
+        self._allocate_array("B", execution_engine)
+        self._allocate_array("C", execution_engine)
+
+    def _deallocate_all(self, execution_engine):
+        for mref in self.memrefs.values():
+            dealloc_func = execution_engine.lookup("host_dealloc_f32")
+            ptr_mref = ctypes.pointer(ctypes.pointer(mref))
+            dealloc_func(get_packed_arg([ptr_mref]))
+        self.memrefs = {}
+
+    @contextmanager
+    def allocate(self, execution_engine):
+        try:
+            self._allocate_inputs(execution_engine)
+            yield None
+        finally:
+            self._deallocate_all(execution_engine)
+
+    def get_input_arrays(self, execution_engine):
+        A = self._allocate_array("A", execution_engine)
+        B = self._allocate_array("B", execution_engine)
+        C = self._allocate_array("C", execution_engine)
+
+        # initialize with MLIR
+        fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32")
+        fill_random_func = execution_engine.lookup("host_fill_random_f32")
+        fill_zero_func(get_packed_arg([ctypes.pointer(ctypes.pointer(C))]))
+        fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(A))]))
+        fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(B))]))
+
+        return [A, B, C]
+
+    def verify(self, execution_engine, verbose: int = 0) -> bool:
+        # compute reference solution with numpy
+        A = ranked_memref_to_numpy([self.memrefs["A"]])
+        B = ranked_memref_to_numpy([self.memrefs["B"]])
+        C = ranked_memref_to_numpy([self.memrefs["C"]])
+        C_ref = A + B
+        if verbose > 1:
+            print("Reference solution:")
+            print(C_ref)
+            print("Computed solution:")
+            print(C)
+        success = np.allclose(C, C_ref)
+
+        # Alternatively we could have done the verification in MLIR by emitting
+        # a check function.
+        # Here we just call the payload function again.
+        # self._allocate_array("C_ref", execution_engine)
+        # func = execution_engine.lookup("payload")
+        # func(get_packed_arg([
+        #     ctypes.pointer(ctypes.pointer(self.memrefs["A"])),
+        #     ctypes.pointer(ctypes.pointer(self.memrefs["B"])),
+        #     ctypes.pointer(ctypes.pointer(self.memrefs["C_ref"])),
+        # ]))
+        # Check correctness with numpy.
+        # C = ranked_memref_to_numpy([self.memrefs["C"]])
+        # C_ref = ranked_memref_to_numpy([self.memrefs["C_ref"]])
+        # success = np.allclose(C, C_ref)
+
+        if verbose:
+            if success:
+                print("PASSED")
+            else:
+                print("FAILED Result mismatch!")
+        return success
+
+    def payload_module(self):
+        mod = super().payload_module()
+        # extend the payload module with de/alloc/fill functions
+        with self.context, self.location:
+            float32_t = ir.F32Type.get()
+            emit_host_alloc(mod, "f32", float32_t)
+            emit_host_dealloc(mod, "f32", float32_t)
+            emit_fill_constant(mod, "zero_f32", 0.0, float32_t)
+            emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0)
+        return mod
+
+
+if __name__ == "__main__":
+    wload = ElementwiseSumMLIRAlloc(400, 400)
+
+    print(" Dump kernel ".center(60, "-"))
+    lower_payload(wload, dump_kernel="bufferized", dump_schedule=False)
+
+    print(" Execute ".center(60, "-"))
+    execute(wload, verbose=2)
+
+    print(" Benchmark ".center(60, "-"))
+    times = benchmark(wload)
+    times *= 1e6  # convert to microseconds
+    # compute statistics
+    mean = np.mean(times)
+    min = np.min(times)
+    max = np.max(times)
+    std = np.std(times)
+    print(f"Timings (us): "
+          f"mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}")
+    flop_count = wload.get_complexity()[0]
+    gflops = flop_count / (mean * 1e-6) / 1e9
+    print(f"Throughput: {gflops:.2f} GFLOPS")