diff --git a/bench-level1.mojo b/bench-level1.mojo index a2c1a24..ed3372e 100644 --- a/bench-level1.mojo +++ b/bench-level1.mojo @@ -1,82 +1,458 @@ from gpu.host import DeviceContext -from sys import has_accelerator +from sys import has_accelerator, argv from time import monotonic - -# importing test wrappers +from math import sin, cos from src import * -from random import rand, seed + +# Reference: https://github.com/icl-utk-edu/blaspp/blob/master/test/run_tests.py comptime WARMUP = 10 -# -def fill_random[dtype: DType]( - a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - n: Int -): - rand[dtype](a, n) def bytes_per_elem(dtype: DType) -> Int: - if dtype == DType.float16: - return 2 if dtype == DType.float32: return 4 if dtype == DType.float64: return 8 return 0 -# -def bench_copy[dtype: DType](n: Int, iters: Int): - with DeviceContext() as ctx: - x_h = ctx.enqueue_create_host_buffer[dtype](n) - y_h = ctx.enqueue_create_host_buffer[dtype](n) - fill_random[dtype](x_h.unsafe_ptr(), n) - fill_random[dtype](y_h.unsafe_ptr(), n) +struct RunParams: + var routines: List[String] + var dtype_str: String + var sizes: List[Int] + var iters: Int + + fn __init__(out self): + self.routines = List[String]() + self.dtype_str = String("all") + self.sizes = List[Int]() + self.iters = 100 + + +def parse_args(mut params: RunParams) -> Bool: + var args = argv() + var n_custom = 0 + + var i = 1 + while i < len(args): + var arg = String(args[i]) + if arg == "--type": + if i + 1 < len(args): + params.dtype_str = String(args[i + 1]) + i += 2 + else: + print("--type requires a value") + return False + elif arg == "--n": + if i + 1 < len(args): + n_custom = Int(args[i + 1]) + i += 2 + else: + print("--n requires a value") + return False + elif arg == "--iters": + if i + 1 < len(args): + params.iters = Int(args[i + 1]) + i += 2 + else: + print("--iters requires a value") + return False + elif not arg.startswith("-"): + params.routines.append(arg) + i += 1 + else: + i += 1 + + if n_custom > 0: + params.sizes.append(n_custom) + else: + params.sizes.append(1024) + params.sizes.append(8192) + params.sizes.append(1048576) + params.sizes.append(8388608) + params.sizes.append(16777216) + + return True + + +def bench_asum[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_asum[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_asum[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("asum,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_axpy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + var alpha = Scalar[dtype](2.0) + + for _ in range(WARMUP): + blas_axpy[dtype](n, alpha, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_axpy[dtype](n, alpha, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + n writes = 3n + var bw_gbs = Float64(3 * n * bytes_per_elem(dtype)) / avg + print("axpy,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_copy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + n writes = 2n + var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg + print("copy,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_dot[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg + print("dot,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_dotc[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + y_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + generate_random_arr[dtype](2 * n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](2 * n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](2 * n) + y_d = ctx.enqueue_create_buffer[dtype](2 * n) + res_d = ctx.enqueue_create_buffer[dtype](2) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dotc[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dotc[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2 vectors * 2n floats = 4n reads + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("dotc,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_dotu[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + y_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + generate_random_arr[dtype](2 * n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](2 * n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](2 * n) + y_d = ctx.enqueue_create_buffer[dtype](2 * n) + res_d = ctx.enqueue_create_buffer[dtype](2) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dotu[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dotu[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2 vectors * 2n floats = 4n reads + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("dotu,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + - x_d = ctx.enqueue_create_buffer[dtype](n) - y_d = ctx.enqueue_create_buffer[dtype](n) +def bench_iamax[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[DType.int64](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() - ctx.enqueue_copy(x_d, x_h) - ctx.enqueue_copy(y_d, y_h) + for _ in range(WARMUP): + blas_iamax[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) - for _ in range(WARMUP): - blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) - ctx.synchronize() + start = monotonic() + for _ in range(iters): + blas_iamax[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() - total: UInt = 0 - start = monotonic() - for _ in range (iters): - blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) - ctx.synchronize() - end = monotonic() - total += (end - start) + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("iamax,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") - avg = total / iters - var elem_bytes = bytes_per_elem(dtype) - var bytes_per_call: Float64 = Float64(2 * n * elem_bytes) - var avg_f: Float64 = Float64(avg) - var bw_gbs = bytes_per_call / avg_f +def bench_nrm2[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_nrm2[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_nrm2[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("nrm2,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_rot[dtype: DType](n: Int, iters: Int, ctx: DeviceContext) where dtype.is_floating_point(): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + var angle = generate_random_scalar[dtype](0, 2 * 3.14159265359) + var c = Scalar[dtype](cos(angle)) + var s = Scalar[dtype](sin(angle)) + + for _ in range(WARMUP): + blas_rot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, c, s, ctx) + + start = monotonic() + for _ in range(iters): + blas_rot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, c, s, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + 2n writes = 4n + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("rot,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_rotg[dtype: DType](iters: Int): + var a = generate_random_scalar[dtype](-100, 100) + var b = generate_random_scalar[dtype](-100, 100) + var c = Scalar[dtype](0) + var s = Scalar[dtype](0) + + for _ in range(WARMUP): + blas_rotg[dtype](UnsafePointer(to=a), UnsafePointer(to=b), UnsafePointer(to=c), UnsafePointer(to=s)) + + start = monotonic() + for _ in range(iters): + blas_rotg[dtype](UnsafePointer(to=a), UnsafePointer(to=b), UnsafePointer(to=c), UnsafePointer(to=s)) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + print("rotg, cpu,", dtype, ", -, ", iters, ",", Int(avg), "ns") + + +# TODO: uncomment once rotmg is added +# def bench_rotm[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): +# x_h = ctx.enqueue_create_host_buffer[dtype](n) +# y_h = ctx.enqueue_create_host_buffer[dtype](n) +# generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) +# generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) +# x_d = ctx.enqueue_create_buffer[dtype](n) +# y_d = ctx.enqueue_create_buffer[dtype](n) +# ctx.enqueue_copy(x_d, x_h) +# ctx.enqueue_copy(y_d, y_h) + +# # d1 and d2 must be positive +# var d1 = generate_random_scalar[dtype](1, 100) +# var d2 = generate_random_scalar[dtype](1, 100) +# var x1 = generate_random_scalar[dtype](-100, 100) +# var y1 = generate_random_scalar[dtype](-100, 100) +# param_h = ctx.enqueue_create_host_buffer[dtype](5) +# param_d = ctx.enqueue_create_buffer[dtype](5) +# # NOTE: need rotmg to compute a valid param +# ctx.enqueue_copy(param_d, param_h) +# ctx.synchronize() + +# for _ in range(WARMUP): +# blas_rotm[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, param_d.unsafe_ptr(), ctx) + +# start = monotonic() +# for _ in range(iters): +# blas_rotm[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, param_d.unsafe_ptr(), ctx) +# end = monotonic() + +# var avg = Float64(end - start) / Float64(iters) +# # bandwidth: 2n reads + 2n writes = 4n +# var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg +# print("rotm,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +# TODO: add bench_rotmg + + +def bench_scal[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + var alpha = Scalar[dtype](2.0) + + for _ in range(WARMUP): + blas_scal[dtype](n, alpha, x_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_scal[dtype](n, alpha, x_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + n writes = 2n + var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg + print("scal,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_swap[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_swap[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_swap[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + 2n writes = 4n + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("swap,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def run_dtype[ + dtype: DType +]( + routine: String, + params: RunParams, + ctx: DeviceContext +) where dtype.is_floating_point(): + for i in range(len(params.sizes)): + var n = params.sizes[i] + if (routine == "asum"): bench_asum[dtype](n, params.iters, ctx) + elif (routine == "axpy"): bench_axpy[dtype](n, params.iters, ctx) + elif (routine == "copy"): bench_copy[dtype](n, params.iters, ctx) + elif (routine == "dot"): bench_dot[dtype](n, params.iters, ctx) + elif (routine == "dotc"): bench_dotc[dtype](n, params.iters, ctx) + elif (routine == "dotu"): bench_dotu[dtype](n, params.iters, ctx) + elif (routine == "iamax"): bench_iamax[dtype](n, params.iters, ctx) + elif (routine == "nrm2"): bench_nrm2[dtype](n, params.iters, ctx) + elif (routine == "rot"): bench_rot[dtype](n, params.iters, ctx) + elif (routine == "rotg"): + bench_rotg[dtype](params.iters) + return + # elif (routine == "rotm"): bench_rotm[dtype](n, params.iters, ctx) + # elif (routine == "rotmg"): + # bench_rotmg[dtype](params.iters) # TODO: implement blas_rotmg + # return + elif (routine == "scal"): bench_scal[dtype](n, params.iters, ctx) + elif (routine == "swap"): bench_swap[dtype](n, params.iters, ctx) + else: + print("Unknown routine:", routine, "for", dtype) + return - print("copy mojo, ", ctx.name(), ", ", dtype.__str__(), ", ", n, ", ", iters, ", ", avg, ",", bw_gbs) def main(): if not has_accelerator(): print("No accelerator detected") return - print("op, backend, gpu, dtype, N, iters, avg time (nanoseconds)\n") - bench_copy[DType.float16](1048576, iters=1000) - bench_copy[DType.float16](8388608, iters=500) - bench_copy[DType.float16](16777216, iters=200) - print("\n") - - bench_copy[DType.float32](1048576, iters=1000) - bench_copy[DType.float32](8388608, iters=500) - bench_copy[DType.float32](16777216, iters=200) - print("\n") - - bench_copy[DType.float64](1048576, iters=1000) - bench_copy[DType.float64](8388608, iters=500) - bench_copy[DType.float64](16777216, iters=200) - print("\n") \ No newline at end of file + var params = RunParams() + if not parse_args(params): + return + + print("op, device, dtype, n, iters, avg time, bandwidth") + + with DeviceContext() as ctx: + for routine in(params.routines): + if params.dtype_str == "float32" or params.dtype_str == "all": + run_dtype[DType.float32](routine, params, ctx) + + if params.dtype_str == "float64" or params.dtype_str == "all": + run_dtype[DType.float64](routine, params, ctx) diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo index 5b97dea..2ee42f9 100644 --- a/src/testing_utils/testing_utils.mojo +++ b/src/testing_utils/testing_utils.mojo @@ -5,9 +5,9 @@ comptime tol32: Float32 = 1e-8 comptime tol64: Float64 = 1e-16 def generate_random_arr[ - dtype: DType, - size: Int + dtype: DType ]( + size: Int, a: UnsafePointer[Scalar[dtype], MutAnyOrigin], min_value: Scalar[dtype], max_value: Scalar[dtype] diff --git a/test-level1.mojo b/test-level1.mojo index 12aaf9a..1820efc 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -1,3 +1,4 @@ +from sys import argv from testing import assert_equal, assert_almost_equal, TestSuite from gpu.host import DeviceContext from math import sqrt @@ -19,7 +20,7 @@ def asum_test[ d_v = ctx.enqueue_create_buffer[dtype](size) v = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](v.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, v.unsafe_ptr(), -10000, 10000) ctx.enqueue_copy(d_v, v) d_res = ctx.enqueue_create_buffer[dtype](1) @@ -66,9 +67,9 @@ def axpy_test[ y = ctx.enqueue_create_host_buffer[dtype](size) mojo_res = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](1, UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) # print("a = ", a) # print("x = ", x) # print("y = ", y) @@ -119,8 +120,8 @@ def copy_test[ x = ctx.enqueue_create_host_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) # print("x = ", x) # print("y = ", y) @@ -157,8 +158,8 @@ def dot_test[ b = ctx.enqueue_create_host_buffer[dtype](size) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size](a.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](b.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, a.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, b.unsafe_ptr(), -100, 100) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -217,8 +218,8 @@ def dot_test_complex[ b = ctx.enqueue_create_host_buffer[dtype](2*size) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, 2*size](a.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, 2*size](b.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](2*size, a.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](2*size, b.unsafe_ptr(), -100, 100) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -277,8 +278,8 @@ def dotc_test[ b = ctx.enqueue_create_host_buffer[dtype](size*2) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size*2](a.unsafe_ptr(), -1, 1) - generate_random_arr[dtype, size*2](b.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](2*size, a.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](2*size, b.unsafe_ptr(), -1, 1) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -340,8 +341,8 @@ def dotu_test[ b = ctx.enqueue_create_host_buffer[dtype](size*2) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size*2](a.unsafe_ptr(), -1, 1) - generate_random_arr[dtype, size*2](b.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](size*2, a.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](size*2, b.unsafe_ptr(), -1, 1) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -401,7 +402,7 @@ def iamax_test[ v = ctx.enqueue_create_host_buffer[dtype](size) # Generate an array of random numbers on CPU - generate_random_arr[dtype, size](v.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, v.unsafe_ptr(), -10000, 10000) # Copy random vector from CPU to GPU memory ctx.enqueue_copy(d_v, v) @@ -451,7 +452,7 @@ def nrm2_test[ d_x = ctx.enqueue_create_buffer[dtype](size) d_res = ctx.enqueue_create_buffer[dtype](1) - generate_random_arr[dtype, size](x.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -1000, 1000) ctx.enqueue_copy(d_x, x) d_res.enqueue_fill(-1) # set result to -1 for now @@ -500,8 +501,8 @@ def rot_test[ d_y = ctx.enqueue_create_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(d_x, x) ctx.enqueue_copy(d_y, y) @@ -607,8 +608,8 @@ def rotm_test[ # size_y = (n - 1) * abs(incy) + 1 x = ctx.enqueue_create_host_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, y.unsafe_ptr(), -100, 100) d_x = ctx.enqueue_create_buffer[dtype](size) d_y = ctx.enqueue_create_buffer[dtype](size) @@ -690,57 +691,57 @@ def rotm_test[ assert_almost_equal(y_result[i], expected_y, atol=atol) -def rotmg_test[ - dtype: DType, - size: Int -](): - with DeviceContext() as ctx: - # d1 and d2 must be positive - var d1 = generate_random_scalar[dtype](1, 10000) - var d2 = generate_random_scalar[dtype](1, 10000) - var x1 = generate_random_scalar[dtype](-10000, 10000) - var y1 = generate_random_scalar[dtype](-10000, 10000) - - d_d1 = ctx.enqueue_create_buffer[dtype](1) - d_d1.enqueue_fill(d1) - d_d2 = ctx.enqueue_create_buffer[dtype](1) - d_d2.enqueue_fill(d2) - d_x1 = ctx.enqueue_create_buffer[dtype](1) - d_x1.enqueue_fill(x1) - d_y1 = ctx.enqueue_create_buffer[dtype](1) - d_y1.enqueue_fill(y1) - d_param = ctx.enqueue_create_buffer[dtype](5) - - # Launch Mojo BLAS kernel - # NOTE: not implemented - # blas_rotmg[dtype]( - # d1.unsafe_ptr(), - # d2.unsafe_ptr(), - # x1.unsafe_ptr(), - # x2.unsafe_ptr(), - # d_param.unsafe_ptr(), - # ctx - # ) - - # Import SciPy and numpy - sp = Python.import_module("scipy") - np = Python.import_module("numpy") - sp_blas = sp.linalg.blas - - # srotmg - float32, drotmg - float64 - if dtype == DType.float32: - py_p = sp_blas.srotmg(d1, d2, x1, y1) - elif dtype == DType.float64: - py_p = sp_blas.drotmg(d1, d2, x1, y1) - else: - print(dtype , " is not supported by SciPy") - return - - # Only compare param - with d_param.map_to_host() as mojo_param: - for i in range(5): - var py_ref = Scalar[dtype](py=py_p[i]) - assert_equal(mojo_param[i], py_ref) +# def rotmg_test[ +# dtype: DType, +# size: Int +# ](): +# with DeviceContext() as ctx: +# # d1 and d2 must be positive +# var d1 = generate_random_scalar[dtype](1, 10000) +# var d2 = generate_random_scalar[dtype](1, 10000) +# var x1 = generate_random_scalar[dtype](-10000, 10000) +# var y1 = generate_random_scalar[dtype](-10000, 10000) + +# d_d1 = ctx.enqueue_create_buffer[dtype](1) +# d_d1.enqueue_fill(d1) +# d_d2 = ctx.enqueue_create_buffer[dtype](1) +# d_d2.enqueue_fill(d2) +# d_x1 = ctx.enqueue_create_buffer[dtype](1) +# d_x1.enqueue_fill(x1) +# d_y1 = ctx.enqueue_create_buffer[dtype](1) +# d_y1.enqueue_fill(y1) +# d_param = ctx.enqueue_create_buffer[dtype](5) + +# # Launch Mojo BLAS kernel +# # NOTE: not implemented +# # blas_rotmg[dtype]( +# # d1.unsafe_ptr(), +# # d2.unsafe_ptr(), +# # x1.unsafe_ptr(), +# # x2.unsafe_ptr(), +# # d_param.unsafe_ptr(), +# # ctx +# # ) + +# # Import SciPy and numpy +# sp = Python.import_module("scipy") +# np = Python.import_module("numpy") +# sp_blas = sp.linalg.blas + +# # srotmg - float32, drotmg - float64 +# if dtype == DType.float32: +# py_p = sp_blas.srotmg(d1, d2, x1, y1) +# elif dtype == DType.float64: +# py_p = sp_blas.drotmg(d1, d2, x1, y1) +# else: +# print(dtype , " is not supported by SciPy") +# return + +# # Only compare param +# with d_param.map_to_host() as mojo_param: +# for i in range(5): +# var py_ref = Scalar[dtype](py=py_p[i]) +# assert_equal(mojo_param[i], py_ref) def scal_test[ @@ -754,8 +755,8 @@ def scal_test[ x = ctx.enqueue_create_host_buffer[dtype](size) mojo_res = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](1, UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) # print("a = ", a) # print("x = ", x) @@ -802,8 +803,8 @@ def swap_test[ x2 = ctx.enqueue_create_host_buffer[dtype](size) y2 = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) d_x = ctx.enqueue_create_buffer[dtype](size) d_y = ctx.enqueue_create_buffer[dtype](size) @@ -892,11 +893,11 @@ def test_rotm(): rotm_test[DType.float64, 256]() rotm_test[DType.float64, 4096]() -def test_rotmg(): - rotmg_test[DType.float32, 256]() - rotmg_test[DType.float32, 4096]() - rotmg_test[DType.float64, 256]() - rotmg_test[DType.float64, 4096]() +# def test_rotmg(): +# rotmg_test[DType.float32, 256]() +# rotmg_test[DType.float32, 4096]() +# rotmg_test[DType.float64, 256]() +# rotmg_test[DType.float64, 4096]() def test_scal(): scal_test[DType.float32, 256]() @@ -912,4 +913,26 @@ def test_swap(): def main(): print("--- MojoBLAS Level 1 routines testing ---") - TestSuite.discover_tests[__functions_in_module()]().run() + var args = argv() + if (len(args) < 2): + TestSuite.discover_tests[__functions_in_module()]().run() + return + + var suite = TestSuite(cli_args=List[StaticString]()) + for i in range(1, len(args)): + if args[i] == "asum": suite.test[test_asum]() + elif args[i] == "axpy": suite.test[test_axpy]() + elif args[i] == "copy": suite.test[test_copy]() + elif args[i] == "dot": suite.test[test_dot]() + elif args[i] == "dotc": suite.test[test_dotc]() + elif args[i] == "dotu": suite.test[test_dotu]() + elif args[i] == "iamax": suite.test[test_iamax]() + elif args[i] == "nrm2": suite.test[test_nrm2]() + elif args[i] == "rot": suite.test[test_rot]() + elif args[i] == "rotg": suite.test[test_rotg]() + elif args[i] == "rotm": suite.test[test_rotm]() + # elif args[i] == "rotmg": suite.test[test_rotmg]() + elif args[i] == "scal": suite.test[test_scal]() + elif args[i] == "swap": suite.test[test_swap]() + else: print("unknown routine:", args[i]) + suite^.run() diff --git a/test-level2.mojo b/test-level2.mojo index a3a48f7..6ddd046 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -1,3 +1,4 @@ +from sys import argv from testing import assert_equal, assert_almost_equal, assert_true, TestSuite from gpu.host import DeviceContext @@ -25,9 +26,9 @@ def gemv_test[ y_d = ctx.enqueue_create_buffer[dtype](y_len) y = ctx.enqueue_create_host_buffer[dtype](y_len) - generate_random_arr[dtype, m * n](A.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, x_len](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, y_len](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m * n, A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](x_len, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](y_len, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(A_d, A) ctx.enqueue_copy(x_d, x) @@ -112,9 +113,9 @@ def ger_test[ y = ctx.enqueue_create_host_buffer[dtype](n) # Generate three arrays of random numbers on CPU - generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m * n, A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](n, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(A_device, A) ctx.enqueue_copy(x_device, x) @@ -188,4 +189,17 @@ def test_ger(): def main(): print("--- MojoBLAS Level 2 routines testing ---") - TestSuite.discover_tests[__functions_in_module()]().run() + var args = argv() + if (len(args) < 2): + TestSuite.discover_tests[__functions_in_module()]().run() + return + + var suite = TestSuite(cli_args=List[StaticString]()) + for i in range(1, len(args)): + if args[i] == "gemv": suite.test[test_gemv]() + elif args[i] == "ger": suite.test[test_ger]() + # elif args[i] == "syr": suite.test[test_syr]() + # elif args[i] == "syr2": suite.test[test_syr2]() + else: print("unknown routine:", args[i]) + suite^.run() +