Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions python/examples/workload/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""
Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.
"""
import numpy as np
from mlir import ir
from mlir.runtime.np_to_memref import get_ranked_memref_descriptor
from mlir.dialects import func, linalg, bufferization
from mlir.dialects import transform
from functools import cached_property
from lighthouse import Workload
from lighthouse.utils.mlir import (
apply_registered_pass,
canonicalize,
cse,
match,
)
from lighthouse.utils.execution import (
lower_payload,
execute,
benchmark,
)


class ElementwiseSum(Workload):
    """
    Computes element-wise sum of (M, N) float32 arrays on CPU.

    We can construct the input arrays and compute the reference solution in
    Python with Numpy.

    We use @cached_property to store the inputs and reference solution in the
    object so that they are only computed once.
    """

    def __init__(self, M, N):
        """Store the problem size and create the MLIR context and location.

        Args:
            M: Size of the first array dimension.
            N: Size of the second array dimension.
        """
        self.M = M
        self.N = N
        self.dtype = np.float32
        self.context = ir.Context()
        self.location = ir.Location.unknown(context=self.context)

    @cached_property
    def _input_arrays(self):
        """Return ``[A, B, C]``: two random inputs and a zeroed output."""
        print(" * Generating input arrays...")
        # A private generator seeded with 2 produces the same values as
        # np.random.seed(2) followed by np.random.rand, but does not clobber
        # NumPy's global random state for the rest of the process.
        rng = np.random.RandomState(2)
        shape = (self.M, self.N)
        A = rng.rand(*shape).astype(self.dtype)
        B = rng.rand(*shape).astype(self.dtype)
        C = np.zeros(shape, dtype=self.dtype)
        return [A, B, C]

    @cached_property
    def _reference_solution(self):
        """Return ``A + B`` computed with NumPy."""
        print(" * Computing reference solution...")
        A, B, _ = self._input_arrays
        return A + B

    def get_input_arrays(self, execution_engine):
        """Return ranked memref descriptors for the cached A, B and C.

        ``execution_engine`` is not used by this implementation.
        """
        return [
            get_ranked_memref_descriptor(a) for a in self._input_arrays
        ]

    def verify(self, execution_engine, verbose: int = 0) -> bool:
        """Compare the cached output array C with the NumPy reference.

        Args:
            execution_engine: Not used by this implementation.
            verbose: Any non-zero value prints PASSED/FAILED; values above 1
                also print both arrays.

        Returns:
            True if C matches the reference according to ``np.allclose``.
        """
        C = self._input_arrays[2]
        C_ref = self._reference_solution
        if verbose > 1:
            print("Reference solution:")
            print(C_ref)
            print("Computed solution:")
            print(C)
        success = np.allclose(C, C_ref)
        if verbose:
            if success:
                print("PASSED")
            else:
                print("FAILED Result mismatch!")
        return success

    def requirements(self):
        """Return an empty list: this workload declares no requirements."""
        return []

    def get_complexity(self):
        """Return ``(flop_count, bytes_read, bytes_written)`` for one run."""
        nbytes = np.dtype(self.dtype).itemsize
        flop_count = self.M * self.N  # one addition per element
        memory_reads = 2 * self.M * self.N * nbytes  # read A and B
        memory_writes = self.M * self.N * nbytes  # write C
        return (flop_count, memory_reads, memory_writes)

    def payload_module(self):
        """Build the MLIR module holding the payload function.

        The function takes three (M, N) f32 memrefs A, B and C, adds A and B
        with ``linalg.add`` on tensors and materializes the result in C.
        """
        with self.context, self.location:
            float32_t = ir.F32Type.get()
            shape = (self.M, self.N)
            tensor_t = ir.RankedTensorType.get(shape, float32_t)
            memref_t = ir.MemRefType.get(shape, float32_t)
            mod = ir.Module.create()
            with ir.InsertionPoint(mod.body):
                args = [memref_t, memref_t, memref_t]
                # NOTE(review): payload_function_name is not defined in this
                # class; presumably it is provided by Workload.
                f = func.FuncOp(self.payload_function_name, (tuple(args), ()))
                f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
                with ir.InsertionPoint(f.add_entry_block()):
                    A = f.arguments[0]
                    B = f.arguments[1]
                    C = f.arguments[2]
                    a_tensor = bufferization.ToTensorOp(
                        tensor_t, A, restrict=True
                    )
                    b_tensor = bufferization.ToTensorOp(
                        tensor_t, B, restrict=True
                    )
                    c_tensor = bufferization.ToTensorOp(
                        tensor_t, C, restrict=True, writable=True
                    )
                    add = linalg.add(a_tensor, b_tensor, outs=[c_tensor])
                    bufferization.MaterializeInDestinationOp(
                        None, add, C, restrict=True, writable=True
                    )
                    func.ReturnOp(())
            return mod

    def schedule_module(self, dump_kernel=None, parameters=None):
        """Build the transform-dialect module that lowers the payload.

        Args:
            dump_kernel: If ``"bufferized"``, the schedule stops after
                bufferization, conversion to loops, CSE and canonicalization;
                otherwise the lowering passes to LLVM are appended.
            parameters: Accepted but not used by this schedule.

        Returns:
            The module containing the ``__transform_main`` named sequence.
        """
        bufferize_passes = ("one-shot-bufferize", "convert-linalg-to-loops")
        llvm_passes = (
            "convert-scf-to-cf",
            "finalize-memref-to-llvm",
            "convert-cf-to-llvm",
            "convert-arith-to-llvm",
            "convert-func-to-llvm",
            "reconcile-unrealized-casts",
        )
        with self.context, self.location:
            schedule_module = ir.Module.create()
            schedule_module.operation.attributes[
                "transform.with_named_sequence"
            ] = ir.UnitAttr.get()
            with ir.InsertionPoint(schedule_module.body):
                named_sequence = transform.NamedSequenceOp(
                    "__transform_main",
                    [transform.AnyOpType.get()],
                    [],
                    arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}],
                )
                with ir.InsertionPoint(named_sequence.body):
                    anytype = transform.AnyOpType.get()
                    # Named func_handle so the imported `func` dialect module
                    # is not shadowed inside this method.
                    func_handle = match(
                        named_sequence.bodyTarget, ops={"func.func"}
                    )
                    mod = transform.get_parent_op(
                        anytype,
                        func_handle,
                        op_name="builtin.module",
                        deduplicate=True,
                    )
                    for pass_name in bufferize_passes:
                        mod = apply_registered_pass(mod, pass_name)
                    cse(mod)
                    canonicalize(mod)

                    # Only continue down to LLVM when the caller did not ask
                    # for the bufferized kernel.
                    if dump_kernel != "bufferized":
                        for pass_name in llvm_passes:
                            mod = apply_registered_pass(mod, pass_name)
                    transform.YieldOp()

            return schedule_module


if __name__ == "__main__":
    # Demo driver: dump the bufferized kernel, execute twice, then benchmark.
    wload = ElementwiseSum(400, 400)

    print(" Dump kernel ".center(60, "-"))
    lower_payload(wload, dump_kernel="bufferized", dump_schedule=True)

    print(" Execute 1 ".center(60, "-"))
    execute(wload, verbose=2)

    print(" Execute 2 ".center(60, "-"))
    execute(wload, verbose=1)

    print(" Benchmark ".center(60, "-"))
    times = benchmark(wload)
    times *= 1e6  # convert to microseconds
    # compute statistics
    mean = np.mean(times)
    # Named t_min / t_max so the builtins min() and max() stay usable.
    t_min = np.min(times)
    t_max = np.max(times)
    std = np.std(times)
    print(f"Timings (us): "
          f"mean={mean:.2f}+/-{std:.2f} min={t_min:.2f} max={t_max:.2f}")
    flop_count = wload.get_complexity()[0]
    # mean is in microseconds: convert back to seconds, then scale to GFLOPS.
    gflops = flop_count / (mean * 1e-6) / 1e9
    print(f"Throughput: {gflops:.2f} GFLOPS")
217 changes: 217 additions & 0 deletions python/examples/workload/example_mlir.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
"""
Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.

In this example, allocation and deallocation of input arrays are done in MLIR.
"""
import numpy as np
from mlir import ir
from mlir.runtime.np_to_memref import (
ranked_memref_to_numpy,
make_nd_memref_descriptor,
as_ctype,
)
from mlir.dialects import func, linalg, bufferization, arith, memref
from mlir.dialects import transform
import ctypes
from contextlib import contextmanager
from lighthouse import Workload
from lighthouse.utils.mlir import (
apply_registered_pass,
canonicalize,
cse,
match,
)
from lighthouse.utils import get_packed_arg
from lighthouse.utils.execution import (
lower_payload,
execute,
benchmark,
)
from example import ElementwiseSum


def emit_host_alloc(mod, suffix, element_type, rank=2):
    """Append a ``host_alloc_<suffix>`` function to ``mod``.

    The emitted function takes ``rank`` i32 sizes, casts each one to index
    and returns a newly allocated, fully dynamic memref of ``element_type``.
    Types are created without an explicit context, so the caller is expected
    to have an MLIR context and location active.
    """
    dynamic = ir.ShapedType.get_dynamic_size()
    result_t = ir.MemRefType.get(rank * (dynamic,), element_type)
    idx_t = ir.IndexType.get()
    size_t = ir.IntegerType.get_signless(32)
    signature = (rank * (size_t,), (result_t,))
    with ir.InsertionPoint(mod.body):
        alloc_fn = func.FuncOp("host_alloc_" + suffix, signature)
        alloc_fn.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(alloc_fn.add_entry_block()):
            # One index cast per i32 size argument, in argument order.
            sizes = []
            for size_arg in list(alloc_fn.arguments):
                sizes.append(arith.IndexCastOp(idx_t, size_arg))
            buffer = memref.alloc(result_t, sizes, [])
            func.ReturnOp((buffer,))


def emit_host_dealloc(mod, suffix, element_type, rank=2):
    """Append a ``host_dealloc_<suffix>`` function to ``mod``.

    The emitted function takes one fully dynamic memref of ``element_type``
    with ``rank`` dimensions and deallocates it.
    """
    dynamic = ir.ShapedType.get_dynamic_size()
    arg_t = ir.MemRefType.get(rank * (dynamic,), element_type)
    signature = ((arg_t,), ())
    with ir.InsertionPoint(mod.body):
        dealloc_fn = func.FuncOp("host_dealloc_" + suffix, signature)
        dealloc_fn.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(dealloc_fn.add_entry_block()):
            buffer = dealloc_fn.arguments[0]
            memref.dealloc(buffer)
            func.ReturnOp(())


def emit_fill_constant(mod, suffix, value, element_type, rank=2):
    """Append a ``host_fill_constant_<suffix>`` function to ``mod``.

    The emitted function takes one fully dynamic memref of ``element_type``
    with ``rank`` dimensions and fills it with the constant ``value``.
    """
    dynamic = ir.ShapedType.get_dynamic_size()
    arg_t = ir.MemRefType.get(rank * (dynamic,), element_type)
    signature = ((arg_t,), ())
    with ir.InsertionPoint(mod.body):
        fill_fn = func.FuncOp("host_fill_constant_" + suffix, signature)
        fill_fn.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(fill_fn.add_entry_block()):
            fill_value = arith.constant(element_type, value)
            target = fill_fn.arguments[0]
            linalg.fill(fill_value, outs=[target])
            func.ReturnOp(())


def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2):
    """Append a ``host_fill_random_<suffix>`` function to ``mod``.

    The emitted function takes one dynamic 2D memref of ``element_type`` and
    fills it with ``linalg.fill_rng_2d``. ``min``, ``max`` and ``seed`` are
    baked into the function as f64, f64 and i32 constants respectively.
    """
    # The rank is fixed because the fill uses the 2D-only fill_rng_2d op.
    dims = 2
    dynamic = ir.ShapedType.get_dynamic_size()
    arg_t = ir.MemRefType.get(dims * (dynamic,), element_type)
    seed_t = ir.IntegerType.get_signless(32)
    bound_t = ir.F64Type.get()
    signature = ((arg_t,), ())
    with ir.InsertionPoint(mod.body):
        fill_fn = func.FuncOp("host_fill_random_" + suffix, signature)
        fill_fn.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(fill_fn.add_entry_block()):
            lower = arith.constant(bound_t, min)
            upper = arith.constant(bound_t, max)
            seed_value = arith.constant(seed_t, seed)
            target = fill_fn.arguments[0]
            linalg.fill_rng_2d(lower, upper, seed_value, outs=[target])
            func.ReturnOp(())


class ElementwiseSumMLIRAlloc(ElementwiseSum):
    """
    Computes element-wise sum of (M, N) float32 arrays on CPU.

    Extends ElementwiseSum by allocating input arrays in MLIR.
    """

    # Keys under which the payload arrays are stored in self.memrefs.
    _ARRAY_NAMES = ("A", "B", "C")

    def __init__(self, M, N):
        """Initialize the base workload and the memref registry."""
        super().__init__(M, N)
        # keep track of allocated memrefs
        self.memrefs = {}

    @staticmethod
    def _memref_arg(mref):
        """Return the pointer-to-pointer form of a memref descriptor."""
        return ctypes.pointer(ctypes.pointer(mref))

    def _allocate_array(self, name, execution_engine):
        """Return the memref stored under ``name``, allocating it if needed.

        New memrefs are created by calling the emitted ``host_alloc_f32``
        function with the (M, N) shape and are recorded in ``self.memrefs``.
        """
        if name in self.memrefs:
            return self.memrefs[name]
        alloc_func = execution_engine.lookup("host_alloc_f32")
        shape = (self.M, self.N)
        mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))()
        ptr_mref = self._memref_arg(mref)
        ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape]
        alloc_func(get_packed_arg([ptr_mref, *ptr_dims]))
        self.memrefs[name] = mref
        return mref

    def _allocate_inputs(self, execution_engine):
        """Allocate the A, B and C memrefs (no-op for existing ones)."""
        for name in self._ARRAY_NAMES:
            self._allocate_array(name, execution_engine)

    def _deallocate_all(self, execution_engine):
        """Deallocate every tracked memref and reset the registry."""
        if self.memrefs:
            # Look the function up once rather than once per memref; the
            # guard keeps the empty case free of any lookup.
            dealloc_func = execution_engine.lookup("host_dealloc_f32")
            for mref in self.memrefs.values():
                dealloc_func(get_packed_arg([self._memref_arg(mref)]))
        self.memrefs = {}

    @contextmanager
    def allocate(self, execution_engine):
        """Context manager that allocates the arrays and frees them on exit.

        Deallocation runs in a ``finally`` clause, so memrefs allocated
        before an error are still released.
        """
        try:
            self._allocate_inputs(execution_engine)
            yield None
        finally:
            self._deallocate_all(execution_engine)

    def get_input_arrays(self, execution_engine):
        """Return the ``[A, B, C]`` memrefs after (re)initializing them.

        C is filled with zeros and A and B with the emitted random fill
        function. Arrays not yet allocated are allocated first.
        """
        A, B, C = [
            self._allocate_array(name, execution_engine)
            for name in self._ARRAY_NAMES
        ]

        # initialize with MLIR
        fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32")
        fill_random_func = execution_engine.lookup("host_fill_random_f32")
        fill_zero_func(get_packed_arg([self._memref_arg(C)]))
        # NOTE(review): A and B go through the same function, whose seed is
        # a baked-in constant, so they presumably receive identical values.
        # Confirm whether that is intended.
        fill_random_func(get_packed_arg([self._memref_arg(A)]))
        fill_random_func(get_packed_arg([self._memref_arg(B)]))

        return [A, B, C]

    def verify(self, execution_engine, verbose: int = 0) -> bool:
        """Check the C memref against ``A + B`` computed with NumPy.

        Args:
            execution_engine: Not used by this implementation.
            verbose: Any non-zero value prints PASSED/FAILED; values above 1
                also print both arrays.

        Returns:
            True if C matches the reference according to ``np.allclose``.

        Raises:
            KeyError: If the "A", "B" or "C" memref is not in
                ``self.memrefs``.
        """
        # compute reference solution with numpy
        A = ranked_memref_to_numpy([self.memrefs["A"]])
        B = ranked_memref_to_numpy([self.memrefs["B"]])
        C = ranked_memref_to_numpy([self.memrefs["C"]])
        C_ref = A + B
        if verbose > 1:
            print("Reference solution:")
            print(C_ref)
            print("Computed solution:")
            print(C)
        success = np.allclose(C, C_ref)

        # Alternatively the verification could be done in MLIR, either by
        # emitting a check function or by allocating a "C_ref" memref,
        # calling the payload function again with (A, B, C_ref) and
        # comparing C and C_ref with numpy.

        if verbose:
            if success:
                print("PASSED")
            else:
                print("FAILED Result mismatch!")
        return success

    def payload_module(self):
        """Return the base payload module extended with host helpers.

        Adds ``host_alloc_f32``, ``host_dealloc_f32``,
        ``host_fill_constant_zero_f32`` and ``host_fill_random_f32`` (random
        values between -1.0 and 1.0).
        """
        mod = super().payload_module()
        # extend the payload module with de/alloc/fill functions
        with self.context, self.location:
            float32_t = ir.F32Type.get()
            emit_host_alloc(mod, "f32", float32_t)
            emit_host_dealloc(mod, "f32", float32_t)
            emit_fill_constant(mod, "zero_f32", 0.0, float32_t)
            emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0)
        return mod


if __name__ == "__main__":
    # Demo driver: dump the bufferized kernel, execute once, then benchmark.
    wload = ElementwiseSumMLIRAlloc(400, 400)

    print(" Dump kernel ".center(60, "-"))
    lower_payload(wload, dump_kernel="bufferized", dump_schedule=False)

    print(" Execute ".center(60, "-"))
    execute(wload, verbose=2)

    print(" Benchmark ".center(60, "-"))
    times = benchmark(wload)
    times *= 1e6  # convert to microseconds
    # compute statistics
    mean = np.mean(times)
    # Named t_min / t_max so the builtins min() and max() stay usable.
    t_min = np.min(times)
    t_max = np.max(times)
    std = np.std(times)
    print(f"Timings (us): "
          f"mean={mean:.2f}+/-{std:.2f} min={t_min:.2f} max={t_max:.2f}")
    flop_count = wload.get_complexity()[0]
    # mean is in microseconds: convert back to seconds, then scale to GFLOPS.
    gflops = flop_count / (mean * 1e-6) / 1e9
    print(f"Throughput: {gflops:.2f} GFLOPS")
Loading