From 53d2ff3c279fac9c5c5521fcdddd8a5835e07a41 Mon Sep 17 00:00:00 2001 From: Jackson Mowry Date: Wed, 18 Mar 2026 18:36:45 +0000 Subject: [PATCH 1/2] sbmv implementation --- src/level2/__init__.mojo | 1 + src/level2/sbmv_device.mojo | 137 ++++++++++++++++++++++++ src/testing_utils/testing_utils.mojo | 49 +++++++++ test-level2.mojo | 153 +++++++++++++++++++++++++++ 4 files changed, 340 insertions(+) create mode 100644 src/level2/sbmv_device.mojo diff --git a/src/level2/__init__.mojo b/src/level2/__init__.mojo index 244b222..fa51daa 100644 --- a/src/level2/__init__.mojo +++ b/src/level2/__init__.mojo @@ -3,3 +3,4 @@ from .ger_device import * from .syr_device import * from .syr2_device import * from .gbmv_device import * +from .sbmv_device import * diff --git a/src/level2/sbmv_device.mojo b/src/level2/sbmv_device.mojo new file mode 100644 index 0000000..b40f33c --- /dev/null +++ b/src/level2/sbmv_device.mojo @@ -0,0 +1,137 @@ +from gpu import thread_idx, block_idx, block_dim, grid_dim +from gpu.host import DeviceContext +from math import ceildiv + +comptime TBsize = 512 + +# level2.sbmv +# Performs symmetric band matrix-vector multiplication +# y := alpha*A*x + beta*y, +# where A is an n by n symmetric band matrix with k off-diagonals. + +fn ssbmv_device( + uplo: Int, + n: Int, + k: Int, + alpha: Float32, + A: UnsafePointer[Float32, ImmutAnyOrigin], + lda: Int, + x: UnsafePointer[Float32, ImmutAnyOrigin], + incx: Int, + beta: Float32, + y: UnsafePointer[Float32, MutAnyOrigin], + incy: Int, +): + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + for i in range(global_i, n, n_threads): + var sum = Scalar[DType.float32](0) + + var j_start = max(0, i - k) + var j_end = min(n - 1, i + k) + + for j in range(j_start, j_end + 1): + var val: Float32 + + if uplo: # upper + if j >= i: + val = A[i * lda + (j - i)] + else: + val = A[j * lda + (i - j)] + else: # lower + if j <= i: + val = A[i * lda + (i - j)] + else: + val = A[j * lda + (j - i)] + + sum += val * x[j * incx] + + y[i * incy] = alpha * sum + beta * y[i * incy] + + +fn dsbmv_device( + uplo: Int, + n: Int, + k: Int, + alpha: Float64, + A: UnsafePointer[Float64, ImmutAnyOrigin], + lda: Int, + x: UnsafePointer[Float64, ImmutAnyOrigin], + incx: Int, + beta: Float64, + y: UnsafePointer[Float64, MutAnyOrigin], + incy: Int, +): + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + for i in range(global_i, n, n_threads): + var sum = Scalar[DType.float64](0) + + var j_start = max(0, i - k) + var j_end = min(n - 1, i + k) + + for j in range(j_start, j_end + 1): + var val: Float64 + + if uplo: # upper + if j >= i: + val = A[i * lda + (j - i)] + else: + val = A[j * lda + (i - j)] + else: # lower + if j <= i: + val = A[i * lda + (i - j)] + else: + val = A[j * lda + (j - i)] + + sum += val * x[j * incx] + + y[i * incy] = alpha * sum + beta * y[i * incy] + + +fn blas_sbmv[dtype: DType]( + uplo: Int, + n: Int, + k: Int, + alpha: Scalar[dtype], + d_A: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], + lda: Int, + d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], + incx: Int, + beta: Scalar[dtype], + d_y: UnsafePointer[Scalar[dtype], MutAnyOrigin], + incy: Int, + ctx: DeviceContext, +) raises: + + # TODO: + # check n > 0 + # check k >= 0 + # check lda >= k + 1 + # check incx, incy > 0 + + @parameter + if dtype == DType.float32: + ctx.enqueue_function[ssbmv_device, ssbmv_device]( + uplo, n, k, + alpha, d_A, lda, + d_x, incx, + beta, d_y, incy, + grid_dim=ceildiv(n, TBsize), + block_dim=TBsize, + ) + elif dtype == DType.float64: + ctx.enqueue_function[dsbmv_device, dsbmv_device]( + uplo, n, k, + alpha, d_A, lda, + d_x, incx, + beta, d_y, incy, + grid_dim=ceildiv(n, TBsize), + block_dim=TBsize, + ) + else: + raise Error("blas_sbmv: Unsupported type") + + ctx.synchronize() diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo index 184eccd..4a0e338 100644 --- a/src/testing_utils/testing_utils.mojo +++ b/src/testing_utils/testing_utils.mojo @@ -155,3 +155,52 @@ fn dense_to_band[dtype: DType]( B[i * band_width + band_col] = A[i * n + j] else: A[i * n + j] = Scalar[dtype](0) + +fn dense_to_sym_band[dtype: DType]( + A_dense: UnsafePointer[Scalar[dtype], MutAnyOrigin], + A_band: UnsafePointer[Scalar[dtype], MutAnyOrigin], + n: Int, + k: Int, + upper: Bool, +): + var lda = k + 1 + + # zero initialize + for i in range(n): + for b in range(lda): + A_band[i * lda + b] = 0 + + if upper: + # store A[i,j] for j >= i + for i in range(n): + var j_end = (i + k) if (i + k < n - 1) else (n - 1) + for j in range(i, j_end + 1): + var band_col = j - i + A_band[i * lda + band_col] = A_dense[i * n + j] + else: + # store A[i,j] for j <= i + for i in range(n): + var j_start = (i - k) if (i - k > 0) else 0 + for j in range(j_start, i + 1): + var band_col = i - j + A_band[i * lda + band_col] = A_dense[i * n + j] + + # Overwrite original matrix with band reconstruction + for i in range(n): + for j in range(n): + var val = Scalar[dtype](0) + + if upper: + if j >= i and j <= i + k: + val = A_band[i * lda + (j - i)] + elif i >= j and i <= j + k: + # symmetric mirror + val = A_band[j * lda + (i - j)] + else: + if j <= i and j >= i - k: + val = A_band[i * lda + (i - j)] + elif j >= i and j <= i + k: + # symmetric mirror + val = A_band[j * lda + (j - i)] + + A_dense[i * n + j] = val diff --git a/test-level2.mojo b/test-level2.mojo index acaa4fe..3f00fe7 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -428,6 +428,140 @@ def gbmv_test[ ) assert_true(ok) +def sbmv_test[ + dtype: DType, + n: Int, + k: Int, + upper: Bool, +](): + comptime lda = k + 1 + + with DeviceContext() as ctx: + # Dense symmetric host matrix (reference) + A_dense = ctx.enqueue_create_host_buffer[dtype](n * n) + + # Band storage (symmetric) + A_band = ctx.enqueue_create_host_buffer[dtype](lda * n) + A_d = ctx.enqueue_create_buffer[dtype](lda * n) + + x = ctx.enqueue_create_host_buffer[dtype](n) + x_d = ctx.enqueue_create_buffer[dtype](n) + + y = ctx.enqueue_create_host_buffer[dtype](n) + y_d = ctx.enqueue_create_buffer[dtype](n) + + # --- Generate symmetric dense matrix --- + generate_random_arr[dtype](n * n, A_dense.unsafe_ptr(), -1, 1) + + # Force symmetry: A = 0.5 * (A + A^T) + for i in range(n): + for j in range(i, n): + var aij = A_dense[i * n + j] + var aji = A_dense[j * n + i] + var sym = (aij + aji) * Scalar[dtype](0.5) + A_dense[i * n + j] = sym + A_dense[j * n + i] = sym + + generate_random_arr[dtype](n, x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype](n, y.unsafe_ptr(), -100, 100) + + # Convert dense -> symmetric band + dense_to_sym_band( + A_dense.unsafe_ptr(), + A_band.unsafe_ptr(), + n, + k, + upper + ) + + ctx.enqueue_copy(A_d, A_band) + ctx.enqueue_copy(x_d, x) + ctx.enqueue_copy(y_d, y) + ctx.synchronize() + + var alpha = generate_random_scalar[dtype](-100, 100) + var beta = generate_random_scalar[dtype](-100, 100) + + var norm_A = frobenius_norm[dtype](A_dense.unsafe_ptr(), n * n) + var norm_x = frobenius_norm[dtype](x.unsafe_ptr(), n) + var norm_y = frobenius_norm[dtype](y.unsafe_ptr(), n) + + blas_sbmv[dtype]( + 1 if upper else 0, + n, + k, + alpha, + A_d.unsafe_ptr(), lda, + x_d.unsafe_ptr(), 1, + beta, + y_d.unsafe_ptr(), 1, + ctx, + ) + + sp = Python.import_module("scipy") + np = Python.import_module("numpy") + sp_blas = sp.linalg.blas + + py_A = Python.list() + py_x = Python.list() + py_y = Python.list() + + for i in range(n * n): + py_A.append(A_dense[i]) + for i in range(n): + py_x.append(x[i]) + py_y.append(y[i]) + + var sp_res: PythonObject + + if dtype == DType.float32: + np_A = np.array(py_A, dtype=np.float32).reshape(n, n) + np_x = np.array(py_x, dtype=np.float32) + np_y = np.array(py_y, dtype=np.float32) + + sp_res = sp_blas.ssymv( + alpha, + np_A, + np_x, + beta=beta, + y=np_y, + lower=0 if upper else 1 + ) + + elif dtype == DType.float64: + np_A = np.array(py_A, dtype=np.float64).reshape(n, n) + np_x = np.array(py_x, dtype=np.float64) + np_y = np.array(py_y, dtype=np.float64) + + sp_res = sp_blas.dsymv( + alpha, + np_A, + np_x, + beta=beta, + y=np_y, + lower=0 if upper else 1 + ) + else: + print("Unsupported type") + return + + with y_d.map_to_host() as res_mojo: + var norm_diff = Scalar[dtype](0) + + for i in range(n): + var diff = res_mojo[i] - Scalar[dtype](py=sp_res[i]) + norm_diff += diff * diff + + norm_diff = sqrt(norm_diff) + + var ok = check_gemm_error[dtype]( + 1, n, n, + alpha, beta, + norm_A, norm_x, norm_y, + norm_diff + ) + assert_true(ok) + def test_gemv(): gemv_test[DType.float32, 64, 64, False]() gemv_test[DType.float32, 64, 64, True]() @@ -466,6 +600,24 @@ def test_gbmv(): gbmv_test[DType.float64, 512, 64, 1, 2, False]() gbmv_test[DType.float64, 512, 64, 2, 2, True]() +def test_sbmv(): + sbmv_test[DType.float32, 64, 1, False]() + sbmv_test[DType.float32, 64, 2, True]() + sbmv_test[DType.float32, 64, 16, False]() + sbmv_test[DType.float32, 64, 16, True]() + sbmv_test[DType.float64, 64, 1, False]() + sbmv_test[DType.float64, 64, 2, True]() + sbmv_test[DType.float64, 64, 16, False]() + sbmv_test[DType.float64, 64, 16, True]() + sbmv_test[DType.float32, 512, 1, False]() + sbmv_test[DType.float32, 512, 2, True]() + sbmv_test[DType.float32, 512, 32, False]() + sbmv_test[DType.float32, 512, 32, True]() + sbmv_test[DType.float64, 512, 1, False]() + sbmv_test[DType.float64, 512, 2, True]() + sbmv_test[DType.float64, 512, 32, False]() + sbmv_test[DType.float64, 512, 32, True]() + def main(): print("--- MojoBLAS Level 2 routines testing ---") var args = argv() @@ -479,6 +631,7 @@ def main(): elif args[i] == "ger": suite.test[test_ger]() # elif args[i] == "syr": suite.test[test_syr]() # elif args[i] == "syr2": suite.test[test_syr2]() + elif args[i] == "sbmv": suite.test[test_sbmv]() else: print("unknown routine:", args[i]) suite^.run() From a7212d7b76f40c3d06f3035c046cb9e79a0846a9 Mon Sep 17 00:00:00 2001 From: tdehoff Date: Fri, 20 Mar 2026 11:47:00 -0400 Subject: [PATCH 2/2] Added routine to produce col-major band storage for scipy --- src/testing_utils/testing_utils.mojo | 53 +++++++++++++++++++++++++++- test-level2.mojo | 41 +++++++++++++-------- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo index 775106a..9f0e5ef 100644 --- a/src/testing_utils/testing_utils.mojo +++ b/src/testing_utils/testing_utils.mojo @@ -156,7 +156,8 @@ fn dense_to_band[dtype: DType]( else: A[i * n + j] = Scalar[dtype](0) -fn dense_to_sym_band[dtype: DType]( +# Packs row-major dense matrix to row-major band buffer +fn dense_to_sym_band_rm[dtype: DType]( A_dense: UnsafePointer[Scalar[dtype], MutAnyOrigin], A_band: UnsafePointer[Scalar[dtype], MutAnyOrigin], n: Int, @@ -204,3 +205,53 @@ fn dense_to_sym_band[dtype: DType]( val = A_band[j * lda + (j - i)] A_dense[i * n + j] = val + +# Packs row-major dense matrix to column-major band buffer +fn dense_to_sym_band_cm[dtype: DType]( + A_dense: UnsafePointer[Scalar[dtype], MutAnyOrigin], + A_band: UnsafePointer[Scalar[dtype], MutAnyOrigin], + n: Int, + k: Int, + upper: Bool, +): + var lda = k + 1 + + # zero initialize + for i in range(n): + for b in range(lda): + A_band[i * lda + b] = 0 + + if upper: + # store A[i,j] for j >= i + for j in range(n): + var m = k - j + var i_start = j - k if (j - k > 0) else 0 + for i in range(i_start, j + 1): + A_band[j * lda + m + i] = A_dense[j * n + i] + else: + # store A[i,j] for j <= i + for j in range(n): + var i_end = (j + k) if (j + k < n - 1) else (n - 1) + for i in range(j, i_end + 1): + var band_col = i - j + A_band[j * lda + band_col] = A_dense[i * n + j] + + # Overwrite original matrix with band reconstruction + for i in range(n): + for j in range(n): + var val = Scalar[dtype](0) + + if upper: + if j >= i and j <= i + k: + val = A_band[(k - (j - i)) + j * lda] + elif i >= j and i <= j + k: + # symmetric mirror + val = A_band[(k - (i - j)) + i * lda] + else: + if j <= i and j >= i - k: + val = A_band[(i - j) + j * lda] + elif j >= i and j <= i + k: + # symmetric mirror + val = A_band[(j - i) + i * lda] + + A_dense[i * n + j] = val diff --git a/test-level2.mojo b/test-level2.mojo index 53b7ed8..ca94fd6 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -440,7 +440,8 @@ def sbmv_test[ A_dense = ctx.enqueue_create_host_buffer[dtype](n * n) # Band storage (symmetric) - A_band = ctx.enqueue_create_host_buffer[dtype](lda * n) + A_band_rm = ctx.enqueue_create_host_buffer[dtype](lda * n) + A_band_cm = ctx.enqueue_create_host_buffer[dtype](n * lda) A_d = ctx.enqueue_create_buffer[dtype](lda * n) x = ctx.enqueue_create_host_buffer[dtype](n) @@ -464,16 +465,25 @@ def sbmv_test[ generate_random_arr[dtype](n, x.unsafe_ptr(), -100, 100) generate_random_arr[dtype](n, y.unsafe_ptr(), -100, 100) - # Convert dense -> symmetric band - dense_to_sym_band( + # Convert dense -> symmetric row-major band for MojoBLAS routine + dense_to_sym_band_rm( A_dense.unsafe_ptr(), - A_band.unsafe_ptr(), + A_band_rm.unsafe_ptr(), n, k, upper ) - ctx.enqueue_copy(A_d, A_band) + # Convert dense -> symmetric col-major band for SciPy routine + dense_to_sym_band_cm( + A_dense.unsafe_ptr(), + A_band_cm.unsafe_ptr(), + n, + k, + upper + ) + + ctx.enqueue_copy(A_d, A_band_rm) ctx.enqueue_copy(x_d, x) ctx.enqueue_copy(y_d, y) ctx.synchronize() @@ -505,8 +515,8 @@ def sbmv_test[ py_x = Python.list() py_y = Python.list() - for i in range(n * n): - py_A.append(A_dense[i]) + for i in range(n * lda): + py_A.append(A_band_cm[i]) for i in range(n): py_x.append(x[i]) py_y.append(y[i]) @@ -514,11 +524,13 @@ def sbmv_test[ var sp_res: PythonObject if dtype == DType.float32: - np_A = np.array(py_A, dtype=np.float32).reshape(n, n) + # order='F' is col-major, 'C' is row-major + np_A = np.array(py_A, dtype=np.float32).reshape(lda, n, order='F') np_x = np.array(py_x, dtype=np.float32) np_y = np.array(py_y, dtype=np.float32) - sp_res = sp_blas.ssymv( + sp_res = sp_blas.ssbmv( + k, alpha, np_A, np_x, @@ -528,11 +540,12 @@ def sbmv_test[ ) elif dtype == DType.float64: - np_A = np.array(py_A, dtype=np.float64).reshape(n, n) + np_A = np.array(py_A, dtype=np.float64).reshape(lda, n, order='F') np_x = np.array(py_x, dtype=np.float64) np_y = np.array(py_y, dtype=np.float64) - sp_res = sp_blas.dsymv( + sp_res = sp_blas.dsbmv( + k, alpha, np_A, np_x, @@ -663,8 +676,8 @@ def trsv_test[ norm_A, norm_x, Scalar[dtype](0), norm_diff ) - assert_true(ok) - + assert_true(ok) + def test_gemv(): gemv_test[DType.float32, 64, 64, False]() gemv_test[DType.float32, 64, 64, True]() @@ -720,7 +733,7 @@ def test_sbmv(): sbmv_test[DType.float64, 512, 2, True]() sbmv_test[DType.float64, 512, 32, False]() sbmv_test[DType.float64, 512, 32, True]() - + def test_trsv(): trsv_test[DType.float32, 64, True, 0, 0]() trsv_test[DType.float32, 64, True, 1, 0]()