From 933ff0bd52c22b4a8349f4e92f72824164d9ba06 Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 11:15:06 -0500 Subject: [PATCH 1/9] added option to run select tests --- test-level1.mojo | 25 ++++++++++++++++++++++++- test-level2.mojo | 14 +++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/test-level1.mojo b/test-level1.mojo index 12aaf9a..b1a40a8 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -1,3 +1,4 @@ +from sys import argv from testing import assert_equal, assert_almost_equal, TestSuite from gpu.host import DeviceContext from math import sqrt @@ -912,4 +913,26 @@ def test_swap(): def main(): print("--- MojoBLAS Level 1 routines testing ---") - TestSuite.discover_tests[__functions_in_module()]().run() + var args = argv() + if (len(args) < 2): + TestSuite.discover_tests[__functions_in_module()]().run() + return + + var suite = TestSuite(cli_args=List[StaticString]()) + for i in range(1, len(args)): + if args[i] == "asum": suite.test[test_asum]() + elif args[i] == "axpy": suite.test[test_axpy]() + elif args[i] == "copy": suite.test[test_copy]() + elif args[i] == "dot": suite.test[test_dot]() + elif args[i] == "dotc": suite.test[test_dotc]() + elif args[i] == "dotu": suite.test[test_dotu]() + elif args[i] == "iamax": suite.test[test_iamax]() + elif args[i] == "nrm2": suite.test[test_nrm2]() + elif args[i] == "rot": suite.test[test_rot]() + elif args[i] == "rotg": suite.test[test_rotg]() + elif args[i] == "rotm": suite.test[test_rotm]() + elif args[i] == "rotmg": suite.test[test_rotmg]() + elif args[i] == "scal": suite.test[test_scal]() + elif args[i] == "swap": suite.test[test_swap]() + else: print("unknown routine:", args[i]) + suite^.run() diff --git a/test-level2.mojo b/test-level2.mojo index a3a48f7..72e3fc6 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -1,3 +1,4 @@ +from sys import argv from testing import assert_equal, assert_almost_equal, assert_true, TestSuite from gpu.host import DeviceContext @@ 
-188,4 +189,15 @@ def test_ger(): def main(): print("--- MojoBLAS Level 2 routines testing ---") - TestSuite.discover_tests[__functions_in_module()]().run() + var args = argv() + if (len(args) < 2): + TestSuite.discover_tests[__functions_in_module()]().run() + return + + var suite = TestSuite(cli_args=List[StaticString]()) + for i in range(1, len(args)): + if args[i] == "gemv": suite.test[test_gemv]() + elif args[i] == "ger": suite.test[test_ger]() + else: print("unknown routine:", args[i]) + suite^.run() + From c7e7efcdc35e6418971b4c72973a9c8092e1afb7 Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 11:18:31 -0500 Subject: [PATCH 2/9] Added missing routines --- test-level2.mojo | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test-level2.mojo b/test-level2.mojo index 72e3fc6..b91c0b7 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -197,7 +197,9 @@ def main(): var suite = TestSuite(cli_args=List[StaticString]()) for i in range(1, len(args)): if args[i] == "gemv": suite.test[test_gemv]() - elif args[i] == "ger": suite.test[test_ger]() + elif args[i] == "ger": suite.test[test_ger]() + elif args[i] == "syr": suite.test[test_syr]() + elif args[i] == "syr2": suite.test[test_syr2]() else: print("unknown routine:", args[i]) suite^.run() From aef383ce38f88931964a35f1b12dd792bc73707d Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 11:37:26 -0500 Subject: [PATCH 3/9] size doesn't have to be a parameter in arr generator (more flexible benchmarks) --- src/testing_utils/testing_utils.mojo | 4 +- test-level1.mojo | 58 ++++++++++++++-------------- test-level2.mojo | 16 ++++---- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo index 5b97dea..2ee42f9 100644 --- a/src/testing_utils/testing_utils.mojo +++ b/src/testing_utils/testing_utils.mojo @@ -5,9 +5,9 @@ comptime tol32: Float32 = 1e-8 comptime tol64: Float64 = 1e-16 def 
generate_random_arr[ - dtype: DType, - size: Int + dtype: DType ]( + size: Int, a: UnsafePointer[Scalar[dtype], MutAnyOrigin], min_value: Scalar[dtype], max_value: Scalar[dtype] diff --git a/test-level1.mojo b/test-level1.mojo index b1a40a8..1fc6ffa 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -20,7 +20,7 @@ def asum_test[ d_v = ctx.enqueue_create_buffer[dtype](size) v = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](v.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, v.unsafe_ptr(), -10000, 10000) ctx.enqueue_copy(d_v, v) d_res = ctx.enqueue_create_buffer[dtype](1) @@ -67,9 +67,9 @@ def axpy_test[ y = ctx.enqueue_create_host_buffer[dtype](size) mojo_res = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](1, UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) # print("a = ", a) # print("x = ", x) # print("y = ", y) @@ -120,8 +120,8 @@ def copy_test[ x = ctx.enqueue_create_host_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) # print("x = ", x) # print("y = ", y) @@ -158,8 +158,8 @@ def dot_test[ b = ctx.enqueue_create_host_buffer[dtype](size) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size](a.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](b.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, a.unsafe_ptr(), -100, 
100) + generate_random_arr[dtype](size, b.unsafe_ptr(), -100, 100) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -218,8 +218,8 @@ def dot_test_complex[ b = ctx.enqueue_create_host_buffer[dtype](2*size) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, 2*size](a.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, 2*size](b.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](2*size, a.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](2*size, b.unsafe_ptr(), -100, 100) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -278,8 +278,8 @@ def dotc_test[ b = ctx.enqueue_create_host_buffer[dtype](size*2) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size*2](a.unsafe_ptr(), -1, 1) - generate_random_arr[dtype, size*2](b.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](2*size, a.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](2*size, b.unsafe_ptr(), -1, 1) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -341,8 +341,8 @@ def dotu_test[ b = ctx.enqueue_create_host_buffer[dtype](size*2) # Generate two arrays of random numbers on CPU - generate_random_arr[dtype, size*2](a.unsafe_ptr(), -1, 1) - generate_random_arr[dtype, size*2](b.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](size*2, a.unsafe_ptr(), -1, 1) + generate_random_arr[dtype](size*2, b.unsafe_ptr(), -1, 1) ctx.enqueue_copy(a_device, a) ctx.enqueue_copy(b_device, b) @@ -402,7 +402,7 @@ def iamax_test[ v = ctx.enqueue_create_host_buffer[dtype](size) # Generate an array of random numbers on CPU - generate_random_arr[dtype, size](v.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, v.unsafe_ptr(), -10000, 10000) # Copy random vector from CPU to GPU memory ctx.enqueue_copy(d_v, v) @@ -452,7 +452,7 @@ def nrm2_test[ d_x = ctx.enqueue_create_buffer[dtype](size) d_res = ctx.enqueue_create_buffer[dtype](1) - generate_random_arr[dtype, size](x.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](size, 
x.unsafe_ptr(), -1000, 1000) ctx.enqueue_copy(d_x, x) d_res.enqueue_fill(-1) # set result to -1 for now @@ -501,8 +501,8 @@ def rot_test[ d_y = ctx.enqueue_create_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(d_x, x) ctx.enqueue_copy(d_y, y) @@ -608,8 +608,8 @@ def rotm_test[ # size_y = (n - 1) * abs(incy) + 1 x = ctx.enqueue_create_host_buffer[dtype](size) y = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, size](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](size, y.unsafe_ptr(), -100, 100) d_x = ctx.enqueue_create_buffer[dtype](size) d_y = ctx.enqueue_create_buffer[dtype](size) @@ -755,8 +755,8 @@ def scal_test[ x = ctx.enqueue_create_host_buffer[dtype](size) mojo_res = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](1, UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) # print("a = ", a) # print("x = ", x) @@ -803,8 +803,8 @@ def swap_test[ x2 = ctx.enqueue_create_host_buffer[dtype](size) y2 = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype](size, y.unsafe_ptr(), -10000, 10000) d_x = ctx.enqueue_create_buffer[dtype](size) d_y = ctx.enqueue_create_buffer[dtype](size) @@ 
-893,11 +893,11 @@ def test_rotm(): rotm_test[DType.float64, 256]() rotm_test[DType.float64, 4096]() -def test_rotmg(): - rotmg_test[DType.float32, 256]() - rotmg_test[DType.float32, 4096]() - rotmg_test[DType.float64, 256]() - rotmg_test[DType.float64, 4096]() +# def test_rotmg(): +# rotmg_test[DType.float32, 256]() +# rotmg_test[DType.float32, 4096]() +# rotmg_test[DType.float64, 256]() +# rotmg_test[DType.float64, 4096]() def test_scal(): scal_test[DType.float32, 256]() diff --git a/test-level2.mojo b/test-level2.mojo index b91c0b7..6ddd046 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -26,9 +26,9 @@ def gemv_test[ y_d = ctx.enqueue_create_buffer[dtype](y_len) y = ctx.enqueue_create_host_buffer[dtype](y_len) - generate_random_arr[dtype, m * n](A.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, x_len](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, y_len](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m * n, A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](x_len, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](y_len, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(A_d, A) ctx.enqueue_copy(x_d, x) @@ -113,9 +113,9 @@ def ger_test[ y = ctx.enqueue_create_host_buffer[dtype](n) # Generate three arrays of random numbers on CPU - generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100) - generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m * n, A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](m, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](n, y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(A_device, A) ctx.enqueue_copy(x_device, x) @@ -198,8 +198,8 @@ def main(): for i in range(1, len(args)): if args[i] == "gemv": suite.test[test_gemv]() elif args[i] == "ger": suite.test[test_ger]() - elif args[i] == "syr": suite.test[test_syr]() - elif args[i] == "syr2": suite.test[test_syr2]() + # elif args[i] == "syr": 
suite.test[test_syr]() + # elif args[i] == "syr2": suite.test[test_syr2]() else: print("unknown routine:", args[i]) suite^.run() From 7fab6a33742029911dcff20c7f22dab7b540d22f Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 12:24:52 -0500 Subject: [PATCH 4/9] changes to copy benchmarks --- bench-level1.mojo | 160 +++++++++++++++++++++++++++++----------------- 1 file changed, 100 insertions(+), 60 deletions(-) diff --git a/bench-level1.mojo b/bench-level1.mojo index a2c1a24..53543f9 100644 --- a/bench-level1.mojo +++ b/bench-level1.mojo @@ -1,82 +1,122 @@ from gpu.host import DeviceContext -from sys import has_accelerator +from sys import has_accelerator, argv from time import monotonic - -# importing test wrappers from src import * -from random import rand, seed + +# Reference: https://github.com/icl-utk-edu/blaspp/blob/master/test/run_tests.py comptime WARMUP = 10 -# -def fill_random[dtype: DType]( - a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - n: Int -): - rand[dtype](a, n) def bytes_per_elem(dtype: DType) -> Int: - if dtype == DType.float16: - return 2 if dtype == DType.float32: return 4 if dtype == DType.float64: return 8 return 0 -# -def bench_copy[dtype: DType](n: Int, iters: Int): - with DeviceContext() as ctx: - x_h = ctx.enqueue_create_host_buffer[dtype](n) - y_h = ctx.enqueue_create_host_buffer[dtype](n) - - fill_random[dtype](x_h.unsafe_ptr(), n) - fill_random[dtype](y_h.unsafe_ptr(), n) - - x_d = ctx.enqueue_create_buffer[dtype](n) - y_d = ctx.enqueue_create_buffer[dtype](n) - - ctx.enqueue_copy(x_d, x_h) - ctx.enqueue_copy(y_d, y_h) - - for _ in range(WARMUP): - blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) - ctx.synchronize() - total: UInt = 0 - start = monotonic() - for _ in range (iters): - blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) - ctx.synchronize() - end = monotonic() - total += (end - start) +struct RunParams: + var routines: List[String] + var dtype_str: String + var sizes: List[Int] 
+ var iters: Int + + fn __init__(out self): + self.routines = List[String]() + self.dtype_str = String("all") + self.sizes = List[Int]() + self.iters = 100 + + +def parse_args(mut params: RunParams) -> Bool: + var args = argv() + var n_custom = 0 + + var i = 1 + while i < len(args): + var arg = String(args[i]) + if arg == "--type": + if i + 1 < len(args): + params.dtype_str = String(args[i + 1]) + i += 2 + else: + print("--type requires a value") + return False + elif arg == "--n": + if i + 1 < len(args): + n_custom = Int(args[i + 1]) + i += 2 + else: + print("--n requires a value") + return False + elif arg == "--iters": + if i + 1 < len(args): + params.iters = Int(args[i + 1]) + i += 2 + else: + print("--iters requires a value") + return False + elif not arg.startswith("-"): + params.routines.append(arg) + i += 1 + else: + i += 1 + + if n_custom > 0: + params.sizes.append(n_custom) + else: + params.sizes.append(1024) + params.sizes.append(8192) + params.sizes.append(1048576) + params.sizes.append(8388608) + params.sizes.append(16777216) + + return True + + +def bench_copy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + + for _ in range(WARMUP): + blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg + + print("copy,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") - avg = total / iters - - var 
elem_bytes = bytes_per_elem(dtype) - var bytes_per_call: Float64 = Float64(2 * n * elem_bytes) - var avg_f: Float64 = Float64(avg) - var bw_gbs = bytes_per_call / avg_f - - print("copy mojo, ", ctx.name(), ", ", dtype.__str__(), ", ", n, ", ", iters, ", ", avg, ",", bw_gbs) def main(): if not has_accelerator(): print("No accelerator detected") return - print("op, backend, gpu, dtype, N, iters, avg time (nanoseconds)\n") - bench_copy[DType.float16](1048576, iters=1000) - bench_copy[DType.float16](8388608, iters=500) - bench_copy[DType.float16](16777216, iters=200) - print("\n") - - bench_copy[DType.float32](1048576, iters=1000) - bench_copy[DType.float32](8388608, iters=500) - bench_copy[DType.float32](16777216, iters=200) - print("\n") - - bench_copy[DType.float64](1048576, iters=1000) - bench_copy[DType.float64](8388608, iters=500) - bench_copy[DType.float64](16777216, iters=200) - print("\n") \ No newline at end of file + var params = RunParams() + if not parse_args(params): + return + + print("op, device, dtype, n, iters, avg time, bandwidth") + + with DeviceContext() as ctx: + for i in range(len(params.sizes)): + var n = params.sizes[i] + if params.dtype_str == "float32" or params.dtype_str == "all": + bench_copy[DType.float32](n, params.iters, ctx) + if params.dtype_str == "float64" or params.dtype_str == "all": + bench_copy[DType.float64](n, params.iters, ctx) From dd98ee6b336054b9ec39ac94f4fce420b98d9b75 Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 12:29:33 -0500 Subject: [PATCH 5/9] removed rotmg test --- test-level1.mojo | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test-level1.mojo b/test-level1.mojo index 1fc6ffa..b46b400 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -741,7 +741,7 @@ def rotmg_test[ with d_param.map_to_host() as mojo_param: for i in range(5): var py_ref = Scalar[dtype](py=py_p[i]) - assert_equal(mojo_param[i], py_ref) + # assert_equal(mojo_param[i], py_ref) def scal_test[ @@ 
-893,11 +893,11 @@ def test_rotm(): rotm_test[DType.float64, 256]() rotm_test[DType.float64, 4096]() -# def test_rotmg(): -# rotmg_test[DType.float32, 256]() -# rotmg_test[DType.float32, 4096]() -# rotmg_test[DType.float64, 256]() -# rotmg_test[DType.float64, 4096]() +def test_rotmg(): + rotmg_test[DType.float32, 256]() + rotmg_test[DType.float32, 4096]() + rotmg_test[DType.float64, 256]() + rotmg_test[DType.float64, 4096]() def test_scal(): scal_test[DType.float32, 256]() From e3338b1e8e8a8c894ad23662bcb9ae0b3fc1761f Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 14:55:29 -0500 Subject: [PATCH 6/9] completely removed rotmg test --- test-level1.mojo | 114 +++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/test-level1.mojo b/test-level1.mojo index b46b400..1820efc 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -691,57 +691,57 @@ def rotm_test[ assert_almost_equal(y_result[i], expected_y, atol=atol) -def rotmg_test[ - dtype: DType, - size: Int -](): - with DeviceContext() as ctx: - # d1 and d2 must be positive - var d1 = generate_random_scalar[dtype](1, 10000) - var d2 = generate_random_scalar[dtype](1, 10000) - var x1 = generate_random_scalar[dtype](-10000, 10000) - var y1 = generate_random_scalar[dtype](-10000, 10000) - - d_d1 = ctx.enqueue_create_buffer[dtype](1) - d_d1.enqueue_fill(d1) - d_d2 = ctx.enqueue_create_buffer[dtype](1) - d_d2.enqueue_fill(d2) - d_x1 = ctx.enqueue_create_buffer[dtype](1) - d_x1.enqueue_fill(x1) - d_y1 = ctx.enqueue_create_buffer[dtype](1) - d_y1.enqueue_fill(y1) - d_param = ctx.enqueue_create_buffer[dtype](5) - - # Launch Mojo BLAS kernel - # NOTE: not implemented - # blas_rotmg[dtype]( - # d1.unsafe_ptr(), - # d2.unsafe_ptr(), - # x1.unsafe_ptr(), - # x2.unsafe_ptr(), - # d_param.unsafe_ptr(), - # ctx - # ) - - # Import SciPy and numpy - sp = Python.import_module("scipy") - np = Python.import_module("numpy") - sp_blas = sp.linalg.blas - - # srotmg - 
float32, drotmg - float64 - if dtype == DType.float32: - py_p = sp_blas.srotmg(d1, d2, x1, y1) - elif dtype == DType.float64: - py_p = sp_blas.drotmg(d1, d2, x1, y1) - else: - print(dtype , " is not supported by SciPy") - return - - # Only compare param - with d_param.map_to_host() as mojo_param: - for i in range(5): - var py_ref = Scalar[dtype](py=py_p[i]) - # assert_equal(mojo_param[i], py_ref) +# def rotmg_test[ +# dtype: DType, +# size: Int +# ](): +# with DeviceContext() as ctx: +# # d1 and d2 must be positive +# var d1 = generate_random_scalar[dtype](1, 10000) +# var d2 = generate_random_scalar[dtype](1, 10000) +# var x1 = generate_random_scalar[dtype](-10000, 10000) +# var y1 = generate_random_scalar[dtype](-10000, 10000) + +# d_d1 = ctx.enqueue_create_buffer[dtype](1) +# d_d1.enqueue_fill(d1) +# d_d2 = ctx.enqueue_create_buffer[dtype](1) +# d_d2.enqueue_fill(d2) +# d_x1 = ctx.enqueue_create_buffer[dtype](1) +# d_x1.enqueue_fill(x1) +# d_y1 = ctx.enqueue_create_buffer[dtype](1) +# d_y1.enqueue_fill(y1) +# d_param = ctx.enqueue_create_buffer[dtype](5) + +# # Launch Mojo BLAS kernel +# # NOTE: not implemented +# # blas_rotmg[dtype]( +# # d1.unsafe_ptr(), +# # d2.unsafe_ptr(), +# # x1.unsafe_ptr(), +# # x2.unsafe_ptr(), +# # d_param.unsafe_ptr(), +# # ctx +# # ) + +# # Import SciPy and numpy +# sp = Python.import_module("scipy") +# np = Python.import_module("numpy") +# sp_blas = sp.linalg.blas + +# # srotmg - float32, drotmg - float64 +# if dtype == DType.float32: +# py_p = sp_blas.srotmg(d1, d2, x1, y1) +# elif dtype == DType.float64: +# py_p = sp_blas.drotmg(d1, d2, x1, y1) +# else: +# print(dtype , " is not supported by SciPy") +# return + +# # Only compare param +# with d_param.map_to_host() as mojo_param: +# for i in range(5): +# var py_ref = Scalar[dtype](py=py_p[i]) +# assert_equal(mojo_param[i], py_ref) def scal_test[ @@ -893,11 +893,11 @@ def test_rotm(): rotm_test[DType.float64, 256]() rotm_test[DType.float64, 4096]() -def test_rotmg(): - 
rotmg_test[DType.float32, 256]() - rotmg_test[DType.float32, 4096]() - rotmg_test[DType.float64, 256]() - rotmg_test[DType.float64, 4096]() +# def test_rotmg(): +# rotmg_test[DType.float32, 256]() +# rotmg_test[DType.float32, 4096]() +# rotmg_test[DType.float64, 256]() +# rotmg_test[DType.float64, 4096]() def test_scal(): scal_test[DType.float32, 256]() @@ -931,7 +931,7 @@ def main(): elif args[i] == "rot": suite.test[test_rot]() elif args[i] == "rotg": suite.test[test_rotg]() elif args[i] == "rotm": suite.test[test_rotm]() - elif args[i] == "rotmg": suite.test[test_rotmg]() + # elif args[i] == "rotmg": suite.test[test_rotmg]() elif args[i] == "scal": suite.test[test_scal]() elif args[i] == "swap": suite.test[test_swap]() else: print("unknown routine:", args[i]) From 99249166f1107086d9904665c02d9af33a979fce Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 15:13:17 -0500 Subject: [PATCH 7/9] retrigger ci From 398399d9f67d3bd27f5cecaa9022a96c36594079 Mon Sep 17 00:00:00 2001 From: tdehoff Date: Tue, 3 Mar 2026 15:18:19 -0500 Subject: [PATCH 8/9] retrigger ci again From 64816153580ae141c1408e7267655d6c63eda13c Mon Sep 17 00:00:00 2001 From: tdehoff Date: Wed, 4 Mar 2026 13:20:50 -0500 Subject: [PATCH 9/9] added remaining benchmarks for level-1 routines --- bench-level1.mojo | 352 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 344 insertions(+), 8 deletions(-) diff --git a/bench-level1.mojo b/bench-level1.mojo index 53543f9..ed3372e 100644 --- a/bench-level1.mojo +++ b/bench-level1.mojo @@ -1,6 +1,7 @@ from gpu.host import DeviceContext from sys import has_accelerator, argv from time import monotonic +from math import sin, cos from src import * # Reference: https://github.com/icl-utk-edu/blaspp/blob/master/test/run_tests.py @@ -75,18 +76,65 @@ def parse_args(mut params: RunParams) -> Bool: return True -def bench_copy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): +def bench_asum[dtype: DType](n: Int, iters: Int, ctx: DeviceContext):
x_h = ctx.enqueue_create_host_buffer[dtype](n) - y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_asum[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_asum[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("asum,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_axpy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) - x_d = ctx.enqueue_create_buffer[dtype](n) y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + var alpha = Scalar[dtype](2.0) + + for _ in range(WARMUP): + blas_axpy[dtype](n, alpha, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + + start = monotonic() + for _ in range(iters): + blas_axpy[dtype](n, alpha, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + n writes = 3n + var bw_gbs = Float64(3 * n * bytes_per_elem(dtype)) / avg + print("axpy,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + +def bench_copy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + 
generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) ctx.enqueue_copy(x_d, x_h) ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() for _ in range(WARMUP): blas_copy[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx) @@ -97,11 +145,299 @@ def bench_copy[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): end = monotonic() var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + n writes = 2n var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg - print("copy,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") +def bench_dot[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg + print("dot,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_dotc[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + y_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + generate_random_arr[dtype](2 * n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](2 * n, y_h.unsafe_ptr(), -1000, 1000) + 
x_d = ctx.enqueue_create_buffer[dtype](2 * n) + y_d = ctx.enqueue_create_buffer[dtype](2 * n) + res_d = ctx.enqueue_create_buffer[dtype](2) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dotc[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dotc[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2 vectors * 2n floats = 4n reads + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("dotc,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_dotu[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + y_h = ctx.enqueue_create_host_buffer[dtype](2 * n) + generate_random_arr[dtype](2 * n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](2 * n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](2 * n) + y_d = ctx.enqueue_create_buffer[dtype](2 * n) + res_d = ctx.enqueue_create_buffer[dtype](2) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_dotu[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_dotu[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2 vectors * 2n floats = 4n reads + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("dotu,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_iamax[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), 
-1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[DType.int64](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_iamax[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_iamax[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("iamax,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_nrm2[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + res_d = ctx.enqueue_create_buffer[dtype](1) + ctx.enqueue_copy(x_d, x_h) + ctx.synchronize() + + for _ in range(WARMUP): + blas_nrm2[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + + start = monotonic() + for _ in range(iters): + blas_nrm2[dtype](n, x_d.unsafe_ptr(), 1, res_d.unsafe_ptr(), ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: n reads + var bw_gbs = Float64(n * bytes_per_elem(dtype)) / avg + print("nrm2,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_rot[dtype: DType](n: Int, iters: Int, ctx: DeviceContext) where dtype.is_floating_point(): + x_h = ctx.enqueue_create_host_buffer[dtype](n) + y_h = ctx.enqueue_create_host_buffer[dtype](n) + generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) + generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) + x_d = ctx.enqueue_create_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + ctx.enqueue_copy(x_d, x_h) + ctx.enqueue_copy(y_d, y_h) + ctx.synchronize() + + var angle = generate_random_scalar[dtype](0, 2 * 3.14159265359) + var c = 
Scalar[dtype](cos(angle)) + var s = Scalar[dtype](sin(angle)) + + for _ in range(WARMUP): + blas_rot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, c, s, ctx) + + start = monotonic() + for _ in range(iters): + blas_rot[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, c, s, ctx) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + # bandwidth: 2n reads + 2n writes = 4n + var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg + print("rot,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s") + + +def bench_rotg[dtype: DType](iters: Int): + var a = generate_random_scalar[dtype](-100, 100) + var b = generate_random_scalar[dtype](-100, 100) + var c = Scalar[dtype](0) + var s = Scalar[dtype](0) + + for _ in range(WARMUP): + blas_rotg[dtype](UnsafePointer(to=a), UnsafePointer(to=b), UnsafePointer(to=c), UnsafePointer(to=s)) + + start = monotonic() + for _ in range(iters): + blas_rotg[dtype](UnsafePointer(to=a), UnsafePointer(to=b), UnsafePointer(to=c), UnsafePointer(to=s)) + end = monotonic() + + var avg = Float64(end - start) / Float64(iters) + print("rotg, cpu,", dtype, ", -, ", iters, ",", Int(avg), "ns") + + +# TODO: uncomment once rotmg is added +# def bench_rotm[dtype: DType](n: Int, iters: Int, ctx: DeviceContext): +# x_h = ctx.enqueue_create_host_buffer[dtype](n) +# y_h = ctx.enqueue_create_host_buffer[dtype](n) +# generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000) +# generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000) +# x_d = ctx.enqueue_create_buffer[dtype](n) +# y_d = ctx.enqueue_create_buffer[dtype](n) +# ctx.enqueue_copy(x_d, x_h) +# ctx.enqueue_copy(y_d, y_h) + +# # d1 and d2 must be positive +# var d1 = generate_random_scalar[dtype](1, 100) +# var d2 = generate_random_scalar[dtype](1, 100) +# var x1 = generate_random_scalar[dtype](-100, 100) +# var y1 = generate_random_scalar[dtype](-100, 100) +# param_h = ctx.enqueue_create_host_buffer[dtype](5) +# param_d = 
ctx.enqueue_create_buffer[dtype](5)
+#     # NOTE: need rotmg to compute a valid param
+#     ctx.enqueue_copy(param_d, param_h)
+#     ctx.synchronize()
+
+#     for _ in range(WARMUP):
+#         blas_rotm[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, param_d.unsafe_ptr(), ctx)
+
+#     start = monotonic()
+#     for _ in range(iters):
+#         blas_rotm[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, param_d.unsafe_ptr(), ctx)
+#     end = monotonic()
+
+#     var avg = Float64(end - start) / Float64(iters)
+#     # bandwidth: 2n reads + 2n writes = 4n
+#     var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg
+#     print("rotm,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s")
+
+
+# TODO: add bench_rotmg
+
+
+def bench_scal[dtype: DType](n: Int, iters: Int, ctx: DeviceContext):
+    """Time `blas_scal` on a random `n`-element vector and print one CSV row.
+
+    Parameters:
+        dtype: Element type of the vector and of the scale factor.
+
+    Args:
+        n: Number of elements in the vector.
+        iters: Number of timed calls the average is taken over.
+        ctx: Device context used for the buffers, the copy and the BLAS calls.
+    """
+    x_h = ctx.enqueue_create_host_buffer[dtype](n)
+    generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000)
+    x_d = ctx.enqueue_create_buffer[dtype](n)
+    ctx.enqueue_copy(x_d, x_h)
+    ctx.synchronize()
+
+    # NOTE(review): presumably x is scaled in place on every call, so its
+    # values double each iteration -- confirm this cannot skew timings for
+    # large iteration counts.
+    var alpha = Scalar[dtype](2.0)
+
+    for _ in range(WARMUP):
+        blas_scal[dtype](n, alpha, x_d.unsafe_ptr(), 1, ctx)
+    # Drain the warm-up work so none of it is billed to the timed loop.
+    ctx.synchronize()
+
+    start = monotonic()
+    for _ in range(iters):
+        blas_scal[dtype](n, alpha, x_d.unsafe_ptr(), 1, ctx)
+    # blas_scal may only enqueue device work (its body is not visible here);
+    # wait for the queue to empty before reading the clock.
+    ctx.synchronize()
+    end = monotonic()
+
+    # avg is reported in ns below, so bytes / avg is already GB/s.
+    var avg = Float64(end - start) / Float64(iters)
+    # bandwidth: n reads + n writes = 2n
+    var bw_gbs = Float64(2 * n * bytes_per_elem(dtype)) / avg
+    print("scal,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s")
+
+
+def bench_swap[dtype: DType](n: Int, iters: Int, ctx: DeviceContext):
+    """Time `blas_swap` on two random `n`-element vectors and print one CSV row.
+
+    Parameters:
+        dtype: Element type of both vectors.
+
+    Args:
+        n: Number of elements in each vector.
+        iters: Number of timed calls the average is taken over.
+        ctx: Device context used for the buffers, the copies and the BLAS calls.
+    """
+    x_h = ctx.enqueue_create_host_buffer[dtype](n)
+    y_h = ctx.enqueue_create_host_buffer[dtype](n)
+    generate_random_arr[dtype](n, x_h.unsafe_ptr(), -1000, 1000)
+    generate_random_arr[dtype](n, y_h.unsafe_ptr(), -1000, 1000)
+    x_d = ctx.enqueue_create_buffer[dtype](n)
+    y_d = ctx.enqueue_create_buffer[dtype](n)
+    ctx.enqueue_copy(x_d, x_h)
+    ctx.enqueue_copy(y_d, y_h)
+    ctx.synchronize()
+
+    for _ in range(WARMUP):
+        blas_swap[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx)
+    # Drain the warm-up work so none of it is billed to the timed loop.
+    ctx.synchronize()
+
+    start = monotonic()
+    for _ in range(iters):
+        blas_swap[dtype](n, x_d.unsafe_ptr(), 1, y_d.unsafe_ptr(), 1, ctx)
+    # blas_swap may only enqueue device work (its body is not visible here);
+    # wait for the queue to empty before reading the clock.
+    ctx.synchronize()
+    end = monotonic()
+
+    # avg is reported in ns below, so bytes / avg is already GB/s.
+    var avg = Float64(end - start) / Float64(iters)
+    # bandwidth: 2n reads + 2n writes = 4n
+    var bw_gbs = Float64(4 * n * bytes_per_elem(dtype)) / avg
+    print("swap,", ctx.name(), ",", dtype, ",", n, ",", iters, ",", Int(avg), "ns,", bw_gbs, "GB/s")
+
+
+def run_dtype[
+    dtype: DType
+](
+    routine: String,
+    params: RunParams,
+    ctx: DeviceContext
+) where dtype.is_floating_point():
+    """Run the benchmark named `routine` for `dtype`.
+
+    Vector routines are run once per entry of `params.sizes`. `rotg` takes
+    no size and is run exactly once. An unrecognised name is reported on
+    stdout and ends the call.
+
+    Parameters:
+        dtype: Floating-point element type forwarded to the benchmark.
+
+    Args:
+        routine: Name of the routine to benchmark, e.g. "axpy".
+        params: Run configuration; `sizes` and `iters` are read here.
+        ctx: Device context forwarded to the device benchmarks.
+    """
+    # rotg does not depend on a vector length, so dispatch it before the size
+    # loop; it then still runs when the size list is empty.
+    if routine == "rotg":
+        bench_rotg[dtype](params.iters)
+        return
+    # if routine == "rotmg":
+    #     bench_rotmg[dtype](params.iters) # TODO: implement blas_rotmg
+    #     return
+
+    for i in range(len(params.sizes)):
+        var n = params.sizes[i]
+        if routine == "asum": bench_asum[dtype](n, params.iters, ctx)
+        elif routine == "axpy": bench_axpy[dtype](n, params.iters, ctx)
+        elif routine == "copy": bench_copy[dtype](n, params.iters, ctx)
+        elif routine == "dot": bench_dot[dtype](n, params.iters, ctx)
+        elif routine == "dotc": bench_dotc[dtype](n, params.iters, ctx)
+        elif routine == "dotu": bench_dotu[dtype](n, params.iters, ctx)
+        elif routine == "iamax": bench_iamax[dtype](n, params.iters, ctx)
+        elif routine == "nrm2": bench_nrm2[dtype](n, params.iters, ctx)
+        elif routine == "rot": bench_rot[dtype](n, params.iters, ctx)
+        # elif routine == "rotm": bench_rotm[dtype](n, params.iters, ctx)
+        elif routine == "scal": bench_scal[dtype](n, params.iters, ctx)
+        elif routine == "swap": bench_swap[dtype](n, params.iters, ctx)
+        else:
+            # Report once and stop; every further size would hit the same branch.
+            print("Unknown routine:", routine, "for", dtype)
+            return
+
+
 def main(): if not has_accelerator(): print("No accelerator detected") @@ -114,9 +450,9 @@ def main(): print("op, device, dtype, n, iters, avg time, bandwidth") with DeviceContext() as ctx: - for i in range(len(params.sizes)): - var n = params.sizes[i] + for routine in(params.routines): if 
params.dtype_str == "float32" or params.dtype_str == "all": - bench_copy[DType.float32](n, params.iters, ctx) + run_dtype[DType.float32](routine, params, ctx) + if params.dtype_str == "float64" or params.dtype_str == "all": - bench_copy[DType.float64](n, params.iters, ctx) + run_dtype[DType.float64](routine, params, ctx)