diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 87ef251..9169eb3 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -13,12 +13,24 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         build_type: [Release, Debug]
-        
+
     steps:
     - uses: actions/checkout@v3
 
+    - name: Install BLAS (Ubuntu)
+      if: runner.os == 'Linux'
+      run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
+
+    - name: Install OpenBLAS (Windows)
+      if: runner.os == 'Windows'
+      run: vcpkg install openblas:x64-windows
+
     - name: Configure CMake
-      run: cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_VERBOSE_MAKEFILE=ON
+      run: >
+        cmake -B build -S .
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+        -DCMAKE_VERBOSE_MAKEFILE=ON
+        ${{ runner.os == 'Windows' && '-DCMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake' || '' }}
       
     - name: Build
       run: cmake --build build --config ${{ matrix.build_type }}
diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml
index 7fb13d3..86f3bbc 100644
--- a/.github/workflows/sanitizer.yml
+++ b/.github/workflows/sanitizer.yml
@@ -16,6 +16,10 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
+      - name: Install BLAS (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
+
       # Configure CMake with ASan + UBSan
       - name: Configure with sanitizers
         run: cmake -B build -S . \
diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml
index b381c8f..9eec318 100644
--- a/.github/workflows/valgrind.yml
+++ b/.github/workflows/valgrind.yml
@@ -11,7 +11,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y valgrind
+          sudo apt-get install -y valgrind libopenblas-dev
 
       - name: Configure Debug build
         run: cmake -B build -S . -DCMAKE_BUILD_TYPE=Debug
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3611755..077d6f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.15)
 project(DNLP_Diff_Engine C)
 set(CMAKE_C_STANDARD 99)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(DIFF_ENGINE_VERSION_MAJOR 0)
 set(DIFF_ENGINE_VERSION_MINOR 1)
@@ -53,6 +54,18 @@ if(NOT MSVC)
     target_link_libraries(dnlp_diff m)
 endif()
 
+# Find and link BLAS
+if(APPLE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    target_link_libraries(dnlp_diff ${ACCELERATE_FRAMEWORK})
+elseif(MSVC)
+    find_package(OpenBLAS CONFIG REQUIRED)
+    target_link_libraries(dnlp_diff OpenBLAS::OpenBLAS)
+else()
+    find_package(BLAS REQUIRED)
+    target_link_libraries(dnlp_diff ${BLAS_LIBRARIES})
+endif()
+
 # Config-specific compile options (compiler-specific)
 if(MSVC)
     target_compile_options(dnlp_diff PRIVATE
diff --git a/include/bivariate.h b/include/bivariate.h
index aa005ed..7260f33 100644
--- a/include/bivariate.h
+++ b/include/bivariate.h
@@ -30,12 +30,18 @@ expr *new_rel_entr_second_arg_scalar(expr *left, expr *right);
 /* Matrix multiplication: Z = X @ Y */
 expr *new_matmul(expr *x, expr *y);
 
-/* Left matrix multiplication: A @ f(x) where A is a constant matrix */
+/* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix */
 expr *new_left_matmul(expr *u, const CSR_Matrix *A);
 
+/* Left matrix multiplication: A @ f(x) where A is a constant dense matrix
+ * (row-major, m x n). Uses CBLAS for efficient computation. */
+expr *new_left_matmul_dense(expr *u, int m, int n, const double *data);
+
 /* Right matrix multiplication: f(x) @ A where A is a constant matrix */
 expr *new_right_matmul(expr *u, const CSR_Matrix *A);
 
+expr *new_right_matmul_dense(expr *u, int m, int n, const double *data);
+
 /* Constant scalar multiplication: a * f(x) where a is a constant double */
 expr *new_const_scalar_mult(double a, expr *child);
 
diff --git a/include/subexpr.h b/include/subexpr.h
index b3f4170..b4b427c 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -21,6 +21,7 @@
 #include "expr.h"
 #include "utils/CSC_Matrix.h"
 #include "utils/CSR_Matrix.h"
+#include "utils/matrix.h"
 
 /* Forward declaration */
 struct int_double_pair;
@@ -107,12 +108,12 @@ important distinction compared to linear_op_expr. */
 typedef struct left_matmul_expr
 {
     expr base;
-    CSR_Matrix *A;
-    CSR_Matrix *AT;
+    Matrix *A;
+    Matrix *AT;
     int n_blocks;
     CSC_Matrix *Jchild_CSC;
     CSC_Matrix *J_CSC;
-    int *csc_to_csr_workspace;
+    int *csc_to_csr_work;
 } left_matmul_expr;
 
 /* Right matrix multiplication: y = f(x) * A where f(x) is an expression.
diff --git a/include/utils/cblas_wrapper.h b/include/utils/cblas_wrapper.h
new file mode 100644
index 0000000..79c648e
--- /dev/null
+++ b/include/utils/cblas_wrapper.h
@@ -0,0 +1,11 @@
+#ifndef CBLAS_WRAPPER_H
+#define CBLAS_WRAPPER_H
+
+#ifdef __APPLE__
+#define ACCELERATE_NEW_LAPACK
+#include <Accelerate/Accelerate.h>
+#else
+#include <cblas.h>
+#endif
+
+#endif /* CBLAS_WRAPPER_H */
diff --git a/include/utils/matrix.h b/include/utils/matrix.h
new file mode 100644
index 0000000..fe7db5f
--- /dev/null
+++ b/include/utils/matrix.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the DNLP-differentiation-engine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include "CSC_Matrix.h"
+#include "CSR_Matrix.h"
+
+/* Base matrix type with function pointers for polymorphic dispatch */
+typedef struct Matrix
+{
+    int m, n;
+    void (*block_left_mult_vec)(const struct Matrix *self, const double *x,
+                                double *y, int p);
+    CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self,
+                                            const CSC_Matrix *J, int p);
+    void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J,
+                                   CSC_Matrix *C);
+    void (*free_fn)(struct Matrix *self);
+} Matrix;
+
+/* Sparse matrix wrapping CSR */
+typedef struct Sparse_Matrix
+{
+    Matrix base;
+    CSR_Matrix *csr;
+} Sparse_Matrix;
+
+/* Dense matrix (row-major) */
+typedef struct Dense_Matrix
+{
+    Matrix base;
+    double *x;
+    double *work; /* scratch buffer, length n */
+} Dense_Matrix;
+
+/* Constructors */
+Matrix *new_sparse_matrix(const CSR_Matrix *A);
+Matrix *new_dense_matrix(int m, int n, const double *data);
+
+/* Transpose helpers */
+Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork);
+Matrix *dense_matrix_trans(const Dense_Matrix *self);
+
+/* Free helper */
+static inline void free_matrix(Matrix *m)
+{
+    if (m)
+    {
+        m->free_fn(m);
+    }
+}
+
+#endif /* MATRIX_H */
diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c
index 31d8251..362d4f4 100644
--- a/src/bivariate/left_matmul.c
+++ b/src/bivariate/left_matmul.c
@@ -17,8 +17,7 @@
  */
 #include "bivariate.h"
 #include "subexpr.h"
-#include "utils/Timer.h"
-#include "utils/linalg_sparse_matmuls.h"
+#include "utils/matrix.h"
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -45,9 +44,6 @@
 
     Working in terms of A_kron unifies the implementation of f(x) being
     vector-valued or matrix-valued.
-
-    I (dance858) think we can get additional big speedups when A is dense by
-    introducing a dense matrix class.
 */
 
 #include "utils/utils.h"
@@ -60,9 +56,9 @@ static void forward(expr *node, const double *u)
     node->left->forward(node->left, u);
 
     /* y = A_kron @ vec(f(x)) */
-    CSR_Matrix *A = ((left_matmul_expr *) node)->A;
+    Matrix *A = ((left_matmul_expr *) node)->A;
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    block_left_multiply_vec(A, x->value, node->value, n_blocks);
+    A->block_left_mult_vec(A, x->value, node->value, n_blocks);
 }
 
 static bool is_affine(const expr *node)
@@ -72,33 +68,32 @@ static bool is_affine(const expr *node)
 
 static void free_type_data(expr *node)
 {
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
-    free_csr_matrix(lin_node->A);
-    free_csr_matrix(lin_node->AT);
-    free_csc_matrix(lin_node->Jchild_CSC);
-    free_csc_matrix(lin_node->J_CSC);
-    free(lin_node->csc_to_csr_workspace);
-    lin_node->A = NULL;
-    lin_node->AT = NULL;
-    lin_node->Jchild_CSC = NULL;
-    lin_node->J_CSC = NULL;
-    lin_node->csc_to_csr_workspace = NULL;
+    left_matmul_expr *lnode = (left_matmul_expr *) node;
+    free_matrix(lnode->A);
+    free_matrix(lnode->AT);
+    free_csc_matrix(lnode->Jchild_CSC);
+    free_csc_matrix(lnode->J_CSC);
+    free(lnode->csc_to_csr_work);
+    lnode->A = NULL;
+    lnode->AT = NULL;
+    lnode->Jchild_CSC = NULL;
+    lnode->J_CSC = NULL;
+    lnode->csc_to_csr_work = NULL;
 }
 
 static void jacobian_init(expr *node)
 {
     expr *x = node->left;
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
+    left_matmul_expr *lnode = (left_matmul_expr *) node;
 
     /* initialize child's jacobian and precompute sparsity of its CSC */
     x->jacobian_init(x);
-    lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
+    lnode->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
 
     /* precompute sparsity of this node's jacobian in CSC and CSR */
-    lin_node->J_CSC = block_left_multiply_fill_sparsity(
-        lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
-    node->jacobian =
-        csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
+    lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC,
+                                                      lnode->n_blocks);
+    node->jacobian = csc_to_csr_fill_sparsity(lnode->J_CSC, lnode->csc_to_csr_work);
 }
 
 static void eval_jacobian(expr *node)
@@ -114,8 +109,8 @@ static void eval_jacobian(expr *node)
     csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);
 
     /* compute this node's jacobian: */
-    block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
-    csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
+    lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC);
+    csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_work);
 }
 
 static void wsum_hess_init(expr *node)
@@ -131,16 +126,16 @@ static void wsum_hess_init(expr *node)
 
     /* work for computing A^T w*/
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
+    int dim = ((left_matmul_expr *) node)->AT->m * n_blocks;
     node->dwork = (double *) malloc(dim * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
 {
     /* compute A^T w*/
-    CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
+    Matrix *AT = ((left_matmul_expr *) node)->AT;
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    block_left_multiply_vec(AT, w, node->dwork, n_blocks);
+    AT->block_left_mult_vec(AT, w, node->dwork, n_blocks);
 
     node->left->eval_wsum_hess(node->left, node->dwork);
     memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
@@ -173,25 +168,64 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
     }
 
     /* Allocate the type-specific struct */
-    left_matmul_expr *lin_node =
+    left_matmul_expr *lnode =
         (left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
-    expr *node = &lin_node->base;
+    expr *node = &lnode->base;
     init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
               is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
     node->left = u;
     expr_retain(u);
 
-    /* allocate workspace. iwork is used for transposing A (requiring size A->n)
-       and for converting J_child csr to csc (requring size node->n_vars).
-       csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size)
-     */
+    /* allocate workspace. iwork is used for converting J_child csr to csc
+       (requiring size node->n_vars) and for transposing A (requiring size A->n).
+       csc_to_csr_work is used for converting J_CSC to CSR (requiring
+       node->size) */
     node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
-    lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
-    lin_node->n_blocks = n_blocks;
+    lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
+    lnode->n_blocks = n_blocks;
 
     /* store A and AT */
-    lin_node->A = new_csr(A);
-    lin_node->AT = transpose(lin_node->A, node->iwork);
+    lnode->A = new_sparse_matrix(A);
+    lnode->AT = sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->iwork);
+
+    return node;
+}
+
+expr *new_left_matmul_dense(expr *u, int m, int n, const double *data)
+{
+    int d1, d2, n_blocks;
+    if (u->d1 == n)
+    {
+        d1 = m;
+        d2 = u->d2;
+        n_blocks = u->d2;
+    }
+    else if (u->d2 == n && u->d1 == 1)
+    {
+        d1 = 1;
+        d2 = m;
+        n_blocks = 1;
+    }
+    else
+    {
+        fprintf(stderr, "Error in new_left_matmul_dense: dimension mismatch\n");
+        exit(1);
+    }
+
+    left_matmul_expr *lnode =
+        (left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
+    expr *node = &lnode->base;
+    init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
+              is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
+    node->left = u;
+    expr_retain(u);
+
+    node->iwork = (int *) malloc(MAX(n, node->n_vars) * sizeof(int));
+    lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
+    lnode->n_blocks = n_blocks;
+
+    lnode->A = new_dense_matrix(m, n, data);
+    lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A);
 
     return node;
 }
diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c
index 6ec593b..1ebff1f 100644
--- a/src/bivariate/right_matmul.c
+++ b/src/bivariate/right_matmul.c
@@ -17,9 +17,7 @@
  */
 #include "affine.h"
 #include "bivariate.h"
-#include "subexpr.h"
 #include "utils/CSR_Matrix.h"
-#include "utils/linalg_sparse_matmuls.h"
 #include <stdlib.h>
 
 /* This file implements the atom 'right_matmul' corresponding to the operation y =
@@ -41,3 +39,24 @@ expr *new_right_matmul(expr *u, const CSR_Matrix *A)
     free(work_transpose);
     return node;
 }
+
+expr *new_right_matmul_dense(expr *u, int m, int n, const double *data)
+{
+    /* We express: u @ A = (A^T @ u^T)^T
+       A is m x n, so A^T is n x m. */
+    double *AT = (double *) malloc(n * m * sizeof(double));
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            AT[j * m + i] = data[i * n + j];
+        }
+    }
+
+    expr *u_transpose = new_transpose(u);
+    expr *left_matmul_node = new_left_matmul_dense(u_transpose, n, m, AT);
+    expr *node = new_transpose(left_matmul_node);
+
+    free(AT);
+    return node;
+}
diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c
new file mode 100644
index 0000000..5400bb2
--- /dev/null
+++ b/src/utils/dense_matrix.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the DNLP-differentiation-engine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "utils/cblas_wrapper.h"
+#include "utils/iVec.h"
+#include "utils/matrix.h"
+#include <stdlib.h>
+#include <string.h>
+
+static void dense_block_left_mult_vec(const Matrix *A, const double *x, double *y,
+                                      int p)
+{
+    const Dense_Matrix *dm = (const Dense_Matrix *) A;
+    int m = dm->base.m;
+    int n = dm->base.n;
+
+    /* y = kron(I_p, A) @ x via a single dgemm call:
+       Treat x as n x p (column-major blocks) and y as m x p.
+       But x and y are stored as p blocks of length n and m
+       respectively (i.e. block-interleaved). This is the same as
+       treating them as row-major matrices of shape p x n and
+       p x m, so:
+       y (p x m) = x (p x n) * A^T (n x m), all row-major.
+       cblas with RowMajor: C = alpha * A * B + beta * C
+       where A = x (p x n), B = A^T (n x m), C = y (p x m). */
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, m, n, 1.0, x, n, dm->x,
+                n, 0.0, y, m);
+}
+
+static CSC_Matrix *dense_block_left_mult_sparsity(const Matrix *A,
+                                                  const CSC_Matrix *J, int p)
+{
+    int m = A->m;
+    int n = A->n;
+    int i, j, jj, block, block_start, block_end, block_jj_start, row_offset;
+
+    int *Cp = (int *) malloc((J->n + 1) * sizeof(int));
+    iVec *Ci = iVec_new(J->n * m);
+    Cp[0] = 0;
+
+    /* for each column of J */
+    for (j = 0; j < J->n; j++)
+    {
+        /* if empty we continue */
+        if (J->p[j] == J->p[j + 1])
+        {
+            Cp[j + 1] = Cp[j];
+            continue;
+        }
+
+        /* process each of p blocks of rows in this column of J */
+        jj = J->p[j];
+        for (block = 0; block < p; block++)
+        {
+            // -----------------------------------------------------------------
+            //  find start and end indices of rows of J in this block
+            // -----------------------------------------------------------------
+            block_start = block * n;
+            block_end = block_start + n;
+            while (jj < J->p[j + 1] && J->i[jj] < block_start)
+            {
+                jj++;
+            }
+
+            block_jj_start = jj;
+            while (jj < J->p[j + 1] && J->i[jj] < block_end)
+            {
+                jj++;
+            }
+
+            /* if no entries in this block, continue */
+            if (jj == block_jj_start)
+            {
+                continue;
+            }
+
+            /* dense A: all m rows contribute */
+            row_offset = block * m;
+            for (i = 0; i < m; i++)
+            {
+                iVec_append(Ci, row_offset + i);
+            }
+        }
+        Cp[j + 1] = Ci->len;
+    }
+
+    CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len);
+    memcpy(C->p, Cp, (J->n + 1) * sizeof(int));
+    memcpy(C->i, Ci->data, Ci->len * sizeof(int));
+    free(Cp);
+    iVec_free(Ci);
+
+    return C;
+}
+
+static void dense_block_left_mult_values(const Matrix *A, const CSC_Matrix *J,
+                                         CSC_Matrix *C)
+{
+    const Dense_Matrix *dm = (const Dense_Matrix *) A;
+    int m = dm->base.m;
+    int n = dm->base.n;
+    int k = J->n;
+
+    int i, j, s, block, block_start, block_end, start, end;
+
+    double *j_dense = dm->work;
+
+    /* for each column of J (and C) */
+    for (j = 0; j < k; j++)
+    {
+        for (i = C->p[j]; i < C->p[j + 1]; i += m)
+        {
+            block = C->i[i] / m;
+            block_start = block * n;
+            block_end = block_start + n;
+
+            start = J->p[j];
+            end = J->p[j + 1];
+
+            while (start < J->p[j + 1] && J->i[start] < block_start)
+            {
+                start++;
+            }
+
+            while (end > start && J->i[end - 1] >= block_end)
+            {
+                end--;
+            }
+
+            /* scatter sparse J col into dense vector and then compute A @ j_dense */
+            memset(j_dense, 0, n * sizeof(double));
+            for (s = start; s < end; s++)
+            {
+                j_dense[J->i[s] - block_start] = J->x[s];
+            }
+
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, dm->x, n, j_dense, 1,
+                        0.0, C->x + i, 1);
+        }
+    }
+}
+
+static void dense_free(Matrix *A)
+{
+    Dense_Matrix *dm = (Dense_Matrix *) A;
+    free(dm->x);
+    free(dm->work);
+    free(dm);
+}
+
+Matrix *new_dense_matrix(int m, int n, const double *data)
+{
+    Dense_Matrix *dm = (Dense_Matrix *) calloc(1, sizeof(Dense_Matrix));
+    dm->base.m = m;
+    dm->base.n = n;
+    dm->base.block_left_mult_vec = dense_block_left_mult_vec;
+    dm->base.block_left_mult_sparsity = dense_block_left_mult_sparsity;
+    dm->base.block_left_mult_values = dense_block_left_mult_values;
+    dm->base.free_fn = dense_free;
+    dm->x = (double *) malloc(m * n * sizeof(double));
+    memcpy(dm->x, data, m * n * sizeof(double));
+    dm->work = (double *) malloc(n * sizeof(double));
+    return &dm->base;
+}
+
+Matrix *dense_matrix_trans(const Dense_Matrix *A)
+{
+    int m = A->base.m;
+    int n = A->base.n;
+    double *AT_x = (double *) malloc(m * n * sizeof(double));
+
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            AT_x[j * m + i] = A->x[i * n + j];
+        }
+    }
+
+    Matrix *result = new_dense_matrix(n, m, AT_x);
+    free(AT_x);
+    return result;
+}
diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c
index 82c03e0..9a7b164 100644
--- a/src/utils/linalg_sparse_matmuls.c
+++ b/src/utils/linalg_sparse_matmuls.c
@@ -109,16 +109,15 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A,
     /* A is m x n, J is (n*p) x k, C is (m*p) x k */
     int m = A->m;
     int n = A->n;
-    int k = J->n;
     int j, jj, block, block_start, block_end, block_jj_start, block_jj_end,
         row_offset;
 
     /* allocate column pointers and an estimate of row indices */
-    int *Cp = (int *) malloc((k + 1) * sizeof(int));
-    iVec *Ci = iVec_new(k * m);
+    int *Cp = (int *) malloc((J->n + 1) * sizeof(int));
+    iVec *Ci = iVec_new(J->n * m);
     Cp[0] = 0;
 
-    /* for each column of j */
+    /* for each column of J */
     for (j = 0; j < J->n; j++)
     {
         /* if empty we continue */
@@ -175,14 +174,9 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A,
         Cp[j + 1] = Ci->len;
     }
 
-    /* Allocate result matrix C in CSC format */
-    CSC_Matrix *C = new_csc_matrix(m * p, k, Ci->len);
-
-    /* Copy column pointers and row indices */
-    memcpy(C->p, Cp, (k + 1) * sizeof(int));
+    CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len);
+    memcpy(C->p, Cp, (J->n + 1) * sizeof(int));
     memcpy(C->i, Ci->data, Ci->len * sizeof(int));
-
-    /* Clean up workspace */
     free(Cp);
     iVec_free(Ci);
 
diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c
new file mode 100644
index 0000000..24ed539
--- /dev/null
+++ b/src/utils/sparse_matrix.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the DNLP-differentiation-engine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "utils/linalg_sparse_matmuls.h"
+#include "utils/matrix.h"
+#include <stdlib.h>
+
+static void sparse_block_left_mult_vec(const Matrix *self, const double *x,
+                                       double *y, int p)
+{
+    const Sparse_Matrix *sm = (const Sparse_Matrix *) self;
+    block_left_multiply_vec(sm->csr, x, y, p);
+}
+
+static CSC_Matrix *sparse_block_left_mult_sparsity(const Matrix *self,
+                                                   const CSC_Matrix *J, int p)
+{
+    const Sparse_Matrix *sm = (const Sparse_Matrix *) self;
+    return block_left_multiply_fill_sparsity(sm->csr, J, p);
+}
+
+static void sparse_block_left_mult_values(const Matrix *self, const CSC_Matrix *J,
+                                          CSC_Matrix *C)
+{
+    const Sparse_Matrix *sm = (const Sparse_Matrix *) self;
+    block_left_multiply_fill_values(sm->csr, J, C);
+}
+
+static void sparse_free(Matrix *self)
+{
+    Sparse_Matrix *sm = (Sparse_Matrix *) self;
+    free_csr_matrix(sm->csr);
+    free(sm);
+}
+
+Matrix *new_sparse_matrix(const CSR_Matrix *A)
+{
+    Sparse_Matrix *sm = (Sparse_Matrix *) calloc(1, sizeof(Sparse_Matrix));
+    sm->base.m = A->m;
+    sm->base.n = A->n;
+    sm->base.block_left_mult_vec = sparse_block_left_mult_vec;
+    sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity;
+    sm->base.block_left_mult_values = sparse_block_left_mult_values;
+    sm->base.free_fn = sparse_free;
+    sm->csr = new_csr(A);
+    return &sm->base;
+}
+
+Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork)
+{
+    CSR_Matrix *AT = transpose(self->csr, iwork);
+    Sparse_Matrix *sm = (Sparse_Matrix *) calloc(1, sizeof(Sparse_Matrix));
+    sm->base.m = AT->m;
+    sm->base.n = AT->n;
+    sm->base.block_left_mult_vec = sparse_block_left_mult_vec;
+    sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity;
+    sm->base.block_left_mult_values = sparse_block_left_mult_values;
+    sm->base.free_fn = sparse_free;
+    sm->csr = AT;
+    return &sm->base;
+}
diff --git a/tests/all_tests.c b/tests/all_tests.c
index fece8de..f9fe909 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -15,6 +15,7 @@
 #include "forward_pass/composite/test_composite.h"
 #include "forward_pass/elementwise/test_exp.h"
 #include "forward_pass/elementwise/test_log.h"
+#include "forward_pass/test_left_matmul_dense.h"
 #include "forward_pass/test_matmul.h"
 #include "forward_pass/test_prod_axis_one.h"
 #include "forward_pass/test_prod_axis_zero.h"
@@ -43,11 +44,13 @@
 #include "jacobian_tests/test_trace.h"
 #include "jacobian_tests/test_transpose.h"
 #include "problem/test_problem.h"
+#include "utils/test_cblas.h"
 #include "utils/test_coo_matrix.h"
 #include "utils/test_csc_matrix.h"
 #include "utils/test_csr_csc_conversion.h"
 #include "utils/test_csr_matrix.h"
 #include "utils/test_linalg_sparse_matmuls.h"
+#include "utils/test_matrix.h"
 #include "wsum_hess/elementwise/test_entr.h"
 #include "wsum_hess/elementwise/test_exp.h"
 #include "wsum_hess/elementwise/test_hyperbolic.h"
@@ -110,6 +113,7 @@ int main(void)
     mu_run_test(test_forward_prod_axis_zero, tests_run);
     mu_run_test(test_forward_prod_axis_one, tests_run);
     mu_run_test(test_matmul, tests_run);
+    mu_run_test(test_left_matmul_dense, tests_run);
 
     printf("\n--- Jacobian Tests ---\n");
     mu_run_test(test_neg_jacobian, tests_run);
@@ -242,6 +246,7 @@ int main(void)
     mu_run_test(test_wsum_hess_transpose, tests_run);
 
     printf("\n--- Utility Tests ---\n");
+    mu_run_test(test_cblas_ddot, tests_run);
     mu_run_test(test_diag_csr_mult, tests_run);
     mu_run_test(test_csr_sum, tests_run);
     mu_run_test(test_csr_sum2, tests_run);
@@ -275,6 +280,11 @@ int main(void)
     mu_run_test(test_csr_to_coo, tests_run);
     mu_run_test(test_csr_to_coo_lower_triangular, tests_run);
     mu_run_test(test_refresh_lower_triangular_coo, tests_run);
+    mu_run_test(test_dense_matrix_mult_vec, tests_run);
+    mu_run_test(test_dense_matrix_mult_vec_blocks, tests_run);
+    mu_run_test(test_sparse_vs_dense_mult_vec, tests_run);
+    mu_run_test(test_dense_matrix_trans, tests_run);
+    mu_run_test(test_sparse_vs_dense_mult_vec_blocks, tests_run);
 
     printf("\n--- Problem Struct Tests ---\n");
     mu_run_test(test_problem_new_free, tests_run);
diff --git a/tests/forward_pass/affine/test_add.h b/tests/forward_pass/affine/test_add.h
index 7ce859c..11fb35c 100644
--- a/tests/forward_pass/affine/test_add.h
+++ b/tests/forward_pass/affine/test_add.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_addition()
+const char *test_addition(void)
 {
     double u[2] = {3.0, 4.0};
     double c[2] = {1.0, 2.0};
diff --git a/tests/forward_pass/affine/test_broadcast.h b/tests/forward_pass/affine/test_broadcast.h
index ca030b5..e4adaab 100644
--- a/tests/forward_pass/affine/test_broadcast.h
+++ b/tests/forward_pass/affine/test_broadcast.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_broadcast_row()
+const char *test_broadcast_row(void)
 {
     /* Test broadcast row: (1, 3) -> (2, 3)
      * Input:  [1.0, 2.0, 3.0] (row vector)
@@ -30,7 +30,7 @@ const char *test_broadcast_row()
     return 0;
 }
 
-const char *test_broadcast_col()
+const char *test_broadcast_col(void)
 {
     /* Test broadcast column: (3, 1) -> (3, 2)
      * Input:  [[1.0],
@@ -56,7 +56,7 @@ const char *test_broadcast_col()
     return 0;
 }
 
-const char *test_broadcast_matrix()
+const char *test_broadcast_matrix(void)
 {
     /* Test no broadcast needed: (2, 3) -> (2, 3)
      * This should work when child shape already matches target
diff --git a/tests/forward_pass/affine/test_hstack.h b/tests/forward_pass/affine/test_hstack.h
index df76ba7..abf8c8e 100644
--- a/tests/forward_pass/affine/test_hstack.h
+++ b/tests/forward_pass/affine/test_hstack.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_hstack_forward_vectors()
+const char *test_hstack_forward_vectors(void)
 {
     /* x is 3x1 variable with values [1, 2, 3] */
     double u[3] = {1.0, 2.0, 3.0};
@@ -33,7 +33,7 @@ const char *test_hstack_forward_vectors()
     return 0;
 }
 
-const char *test_hstack_forward_matrix()
+const char *test_hstack_forward_matrix(void)
 {
     /* x is 3x2 variable with values stored column-wise: [1, 3, 5, 2, 4, 6] */
     double u[6] = {1.0, 3.0, 5.0, 2.0, 4.0, 6.0};
diff --git a/tests/forward_pass/affine/test_linear_op.h b/tests/forward_pass/affine/test_linear_op.h
index 6034ea8..7cb2272 100644
--- a/tests/forward_pass/affine/test_linear_op.h
+++ b/tests/forward_pass/affine/test_linear_op.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_linear_op()
+const char *test_linear_op(void)
 {
     /* create CSR matrix
      A = [0 0 2 3 0 0]
diff --git a/tests/forward_pass/affine/test_sum.h b/tests/forward_pass/affine/test_sum.h
index 2e3d6ab..ae0bc37 100644
--- a/tests/forward_pass/affine/test_sum.h
+++ b/tests/forward_pass/affine/test_sum.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_sum_axis_neg1()
+const char *test_sum_axis_neg1(void)
 {
     /* Create a 3x2 constant matrix stored column-wise:
        [1, 4]
@@ -33,7 +33,7 @@ const char *test_sum_axis_neg1()
     return 0;
 }
 
-const char *test_sum_axis_0()
+const char *test_sum_axis_0(void)
 {
     /* Create a 3x2 constant matrix stored column-wise:
        [1, 4]
@@ -60,7 +60,7 @@ const char *test_sum_axis_0()
     return 0;
 }
 
-const char *test_sum_axis_1()
+const char *test_sum_axis_1(void)
 {
     /* Create a 3x2 constant matrix stored column-wise:
        [1, 4]
diff --git a/tests/forward_pass/affine/test_variable_constant.h b/tests/forward_pass/affine/test_variable_constant.h
index c964654..9df0f1a 100644
--- a/tests/forward_pass/affine/test_variable_constant.h
+++ b/tests/forward_pass/affine/test_variable_constant.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_variable()
+const char *test_variable(void)
 {
     double u[3] = {1.0, 2.0, 3.0};
     expr *var = new_variable(3, 1, 0, 3);
@@ -17,7 +17,7 @@ const char *test_variable()
     return 0;
 }
 
-const char *test_constant()
+const char *test_constant(void)
 {
     double c[2] = {5.0, 10.0};
     double u[2] = {0.0, 0.0};
diff --git a/tests/forward_pass/composite/test_composite.h b/tests/forward_pass/composite/test_composite.h
index 92074a0..8839040 100644
--- a/tests/forward_pass/composite/test_composite.h
+++ b/tests/forward_pass/composite/test_composite.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_composite()
+const char *test_composite(void)
 {
     double u[2] = {1.0, 2.0};
     double c[2] = {1.0, 1.0};
diff --git a/tests/forward_pass/elementwise/test_exp.h b/tests/forward_pass/elementwise/test_exp.h
index a153889..88a4af9 100644
--- a/tests/forward_pass/elementwise/test_exp.h
+++ b/tests/forward_pass/elementwise/test_exp.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_exp()
+const char *test_exp(void)
 {
     double u[2] = {0.0, 1.0};
     expr *var = new_variable(2, 1, 0, 2);
diff --git a/tests/forward_pass/elementwise/test_log.h b/tests/forward_pass/elementwise/test_log.h
index d21dbf0..5287803 100644
--- a/tests/forward_pass/elementwise/test_log.h
+++ b/tests/forward_pass/elementwise/test_log.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_log()
+const char *test_log(void)
 {
     double u[2] = {1.0, 2.718281828};
     expr *var = new_variable(2, 1, 0, 2);
diff --git a/tests/forward_pass/test_left_matmul_dense.h b/tests/forward_pass/test_left_matmul_dense.h
new file mode 100644
index 0000000..a953806
--- /dev/null
+++ b/tests/forward_pass/test_left_matmul_dense.h
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "bivariate.h"
+#include "expr.h"
+#include "minunit.h"
+#include "test_helpers.h"
+
+const char *test_left_matmul_dense(void)
+{
+    /* Test: Z = A @ X where
+     * A is 3x3 (row-major): [1 2 3; 4 5 6; 7 8 9]
+     * X is 3x3 variable (col-major): [1 4 7; 2 5 8; 3 6 9]
+     *
+     * Z = A @ X = [14   32   50]
+     *             [32   77  122]
+     *             [50  122  194]
+     */
+
+    /* Create X variable (3 x 3) */
+    expr *X = new_variable(3, 3, 0, 9);
+
+    /* Constant matrix A in row-major order */
+    double A_data[9] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
+
+    /* Build expression Z = A @ X */
+    expr *Z = new_left_matmul_dense(X, 3, 3, A_data);
+
+    /* Variable values in column-major order */
+    double u[9] = {1.0, 2.0, 3.0,  /* first column */
+                   4.0, 5.0, 6.0,  /* second column */
+                   7.0, 8.0, 9.0}; /* third column */
+
+    /* Evaluate forward pass */
+    Z->forward(Z, u);
+
+    /* Expected result (3 x 3) in column-major order */
+    double expected[9] = {14.0, 32.0,  50.0,   /* first column */
+                          32.0, 77.0,  122.0,  /* second column */
+                          50.0, 122.0, 194.0}; /* third column */
+
+    /* Verify dimensions */
+    mu_assert("left_matmul_dense result should have d1=3", Z->d1 == 3);
+    mu_assert("left_matmul_dense result should have d2=3", Z->d2 == 3);
+    mu_assert("left_matmul_dense result should have size=9", Z->size == 9);
+
+    /* Verify values */
+    mu_assert("Left matmul dense forward pass test failed",
+              cmp_double_array(Z->value, expected, 9));
+
+    free_expr(Z);
+    return 0;
+}
diff --git a/tests/forward_pass/test_matmul.h b/tests/forward_pass/test_matmul.h
index b2731d1..be53542 100644
--- a/tests/forward_pass/test_matmul.h
+++ b/tests/forward_pass/test_matmul.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_matmul()
+const char *test_matmul(void)
 {
     /* Test: Z = X @ Y where
      * X is 3x2:  [1  4]     Y is 2x4:  [7   9   11  13]
diff --git a/tests/forward_pass/test_prod_axis_one.h b/tests/forward_pass/test_prod_axis_one.h
index 6f4b3bb..7cf74e0 100644
--- a/tests/forward_pass/test_prod_axis_one.h
+++ b/tests/forward_pass/test_prod_axis_one.h
@@ -8,7 +8,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_forward_prod_axis_one()
+const char *test_forward_prod_axis_one(void)
 {
     /* Create a 2x3 constant matrix stored column-wise:
        [1, 3, 5]
diff --git a/tests/forward_pass/test_prod_axis_zero.h b/tests/forward_pass/test_prod_axis_zero.h
index 6504502..f87782a 100644
--- a/tests/forward_pass/test_prod_axis_zero.h
+++ b/tests/forward_pass/test_prod_axis_zero.h
@@ -8,7 +8,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_forward_prod_axis_zero()
+const char *test_forward_prod_axis_zero(void)
 {
     /* Create a 2x3 constant matrix stored column-wise:
        [1, 3, 5]
diff --git a/tests/jacobian_tests/test_broadcast.h b/tests/jacobian_tests/test_broadcast.h
index 612e5cf..e6ffa4f 100644
--- a/tests/jacobian_tests/test_broadcast.h
+++ b/tests/jacobian_tests/test_broadcast.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_broadcast_row_jacobian()
+const char *test_broadcast_row_jacobian(void)
 {
     /* Test jacobian of broadcast row: (1, 3) -> (2, 3)
      * Input variable: 3 elements (d1=1, d2=3)
diff --git a/tests/jacobian_tests/test_composite.h b/tests/jacobian_tests/test_composite.h
index 0a52cc9..b001f1c 100644
--- a/tests/jacobian_tests/test_composite.h
+++ b/tests/jacobian_tests/test_composite.h
@@ -5,7 +5,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_composite_log()
+const char *test_jacobian_composite_log(void)
 {
     double u_vals[6] = {0, 0, 1, 2, 3, 0};
 
@@ -45,7 +45,7 @@ const char *test_jacobian_composite_log()
        0 0 0 0 0 2 2
        0 0 0 0 0 3 3]
 */
-const char *test_jacobian_composite_log_add()
+const char *test_jacobian_composite_log_add(void)
 {
     double u_vals[7] = {0, 0, 1, 1, 1, 2, 2};
 
diff --git a/tests/jacobian_tests/test_const_scalar_mult.h b/tests/jacobian_tests/test_const_scalar_mult.h
index 3143929..55aca74 100644
--- a/tests/jacobian_tests/test_const_scalar_mult.h
+++ b/tests/jacobian_tests/test_const_scalar_mult.h
@@ -8,7 +8,7 @@
 
 /* Test: y = a * log(x) where a is a scalar constant */
 
-const char *test_jacobian_const_scalar_mult_log_vector()
+const char *test_jacobian_const_scalar_mult_log_vector(void)
 {
     /* Create variable x: [1.0, 2.0, 4.0] with 3 elements */
     double u_vals[3] = {1.0, 2.0, 4.0};
@@ -44,7 +44,7 @@ const char *test_jacobian_const_scalar_mult_log_vector()
     return 0;
 }
 
-const char *test_jacobian_const_scalar_mult_log_matrix()
+const char *test_jacobian_const_scalar_mult_log_matrix(void)
 {
     /* Create variable x as 2x2 matrix: [[1.0, 2.0], [4.0, 8.0]] */
     double u_vals[4] = {1.0, 2.0, 4.0, 8.0};
diff --git a/tests/jacobian_tests/test_const_vector_mult.h b/tests/jacobian_tests/test_const_vector_mult.h
index 4658dd4..8688433 100644
--- a/tests/jacobian_tests/test_const_vector_mult.h
+++ b/tests/jacobian_tests/test_const_vector_mult.h
@@ -8,7 +8,7 @@
 
 /* Test: y = a ∘ log(x) where a is a constant vector */
 
-const char *test_jacobian_const_vector_mult_log_vector()
+const char *test_jacobian_const_vector_mult_log_vector(void)
 {
     /* Create variable x: [1.0, 2.0, 4.0] */
     double u_vals[3] = {1.0, 2.0, 4.0};
@@ -48,7 +48,7 @@ const char *test_jacobian_const_vector_mult_log_vector()
     return 0;
 }
 
-const char *test_jacobian_const_vector_mult_log_matrix()
+const char *test_jacobian_const_vector_mult_log_matrix(void)
 {
     /* Create variable x as 2x2 matrix: [[1.0, 2.0], [4.0, 8.0]] */
     double u_vals[4] = {1.0, 2.0, 4.0, 8.0};
diff --git a/tests/jacobian_tests/test_elementwise_mult.h b/tests/jacobian_tests/test_elementwise_mult.h
index 5f10bf3..bad278b 100644
--- a/tests/jacobian_tests/test_elementwise_mult.h
+++ b/tests/jacobian_tests/test_elementwise_mult.h
@@ -6,7 +6,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_elementwise_mult_1()
+const char *test_jacobian_elementwise_mult_1(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 3 x 1
     // we compute jacobian of x * y (elementwise multiplication)
@@ -31,7 +31,7 @@ const char *test_jacobian_elementwise_mult_1()
     return 0;
 }
 
-const char *test_jacobian_elementwise_mult_2()
+const char *test_jacobian_elementwise_mult_2(void)
 {
     // var = (z, y, w, x) where z is 2 x 1, y is 3 x 1, w is 2 x 1, x is 3 x 1
     // we compute jacobian of x * y (elementwise multiplication)
@@ -56,7 +56,7 @@ const char *test_jacobian_elementwise_mult_2()
     return 0;
 }
 
-const char *test_jacobian_elementwise_mult_3()
+const char *test_jacobian_elementwise_mult_3(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 3 x 1
     // we compute jacobian of Ax * By (elementwise multiplication)
@@ -121,7 +121,7 @@ const char *test_jacobian_elementwise_mult_3()
     return 0;
 }
 
-const char *test_jacobian_elementwise_mult_4()
+const char *test_jacobian_elementwise_mult_4(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 3 x 1
     // we compute jacobian of Ax * Ax (elementwise multiplication)
diff --git a/tests/jacobian_tests/test_hstack.h b/tests/jacobian_tests/test_hstack.h
index 67ed85e..52f7008 100644
--- a/tests/jacobian_tests/test_hstack.h
+++ b/tests/jacobian_tests/test_hstack.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_hstack_vectors()
+const char *test_jacobian_hstack_vectors(void)
 {
     /* Test Jacobian of hstack(log(x), exp(x), sin(x)) where x is 3x1
      * x = [1, 2, 3] at global indices [0, 1, 2]
@@ -49,7 +49,7 @@ const char *test_jacobian_hstack_vectors()
     return 0;
 }
 
-const char *test_jacobian_hstack_matrix()
+const char *test_jacobian_hstack_matrix(void)
 {
     /* Test Jacobian of hstack(log(x), exp(x), sin(x)) where x is 3x2
      * x stored column-wise: [1, 3, 5, 2, 4, 6]
diff --git a/tests/jacobian_tests/test_left_matmul.h b/tests/jacobian_tests/test_left_matmul.h
index f79eee7..480679a 100644
--- a/tests/jacobian_tests/test_left_matmul.h
+++ b/tests/jacobian_tests/test_left_matmul.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_left_matmul_log()
+const char *test_jacobian_left_matmul_log(void)
 {
     /* Test Jacobian of A @ log(x) where:
      * x is 3x1 variable at x = [1, 2, 3]
@@ -68,7 +68,7 @@ const char *test_jacobian_left_matmul_log()
     return 0;
 }
 
-const char *test_jacobian_left_matmul_log_matrix()
+const char *test_jacobian_left_matmul_log_matrix(void)
 {
     /* x is 3x2, vectorized column-wise: [1,2,3 | 4,5,6] */
     double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
@@ -107,7 +107,7 @@ const char *test_jacobian_left_matmul_log_matrix()
     return 0;
 }
 
-const char *test_jacobian_left_matmul_log_composite()
+const char *test_jacobian_left_matmul_log_composite(void)
 {
     /* Test Jacobian of A @ log(B @ x) where:
      * x is 3x1 variable at x = [1, 2, 3]
diff --git a/tests/jacobian_tests/test_log.h b/tests/jacobian_tests/test_log.h
index f17c290..34d9c4a 100644
--- a/tests/jacobian_tests/test_log.h
+++ b/tests/jacobian_tests/test_log.h
@@ -6,7 +6,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_log()
+const char *test_jacobian_log(void)
 {
     double u_vals[5] = {0.0, 0.0, 1.0, 2.0, 3.0};
     double expected_Ax[3] = {1.0, 0.5, 0.333333333};
@@ -24,7 +24,7 @@ const char *test_jacobian_log()
     return 0;
 }
 
-const char *test_jacobian_log_matrix()
+const char *test_jacobian_log_matrix(void)
 {
     double u_vals[7] = {0.0, 0.0, 0.0, 1.0, 2.0, 4.0, 5.0};
     double expected_Ax[4] = {1.0, 0.5, 0.25, 0.2};
diff --git a/tests/jacobian_tests/test_matmul.h b/tests/jacobian_tests/test_matmul.h
index 16a23fc..8d673cc 100644
--- a/tests/jacobian_tests/test_matmul.h
+++ b/tests/jacobian_tests/test_matmul.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_matmul()
+const char *test_jacobian_matmul(void)
 {
     /* Test: Z = X @ Y where X is 2x3, Y is 3x4
      * var = (X, Y) where X starts at index 0 (size 6), Y starts at index 6 (size 12)
diff --git a/tests/jacobian_tests/test_prod.h b/tests/jacobian_tests/test_prod.h
index 8fbfe99..95e8d86 100644
--- a/tests/jacobian_tests/test_prod.h
+++ b/tests/jacobian_tests/test_prod.h
@@ -1,11 +1,12 @@
 #include <stdio.h>
 
+#include "affine.h"
 #include "expr.h"
 #include "minunit.h"
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_prod_no_zero()
+const char *test_jacobian_prod_no_zero(void)
 {
     /* x is 4x1 variable, global index 2, total 8 vars
      * x = [1, 2, 3, 4]
@@ -32,7 +33,7 @@ const char *test_jacobian_prod_no_zero()
     return 0;
 }
 
-const char *test_jacobian_prod_one_zero()
+const char *test_jacobian_prod_one_zero(void)
 {
     /* x = [1, 0, 3, 4], zero at index 1
      * df/dx = [0, prod_nonzero, 0, 0] = [0, 12, 0, 0]
@@ -57,7 +58,7 @@ const char *test_jacobian_prod_one_zero()
     return 0;
 }
 
-const char *test_jacobian_prod_two_zeros()
+const char *test_jacobian_prod_two_zeros(void)
 {
     /* x = [1, 0, 0, 4], two zeros -> Jacobian all zeros */
     double u_vals[8] = {0.0, 0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0};
diff --git a/tests/jacobian_tests/test_prod_axis_one.h b/tests/jacobian_tests/test_prod_axis_one.h
index 77993fb..116e110 100644
--- a/tests/jacobian_tests/test_prod_axis_one.h
+++ b/tests/jacobian_tests/test_prod_axis_one.h
@@ -6,7 +6,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_prod_axis_one()
+const char *test_jacobian_prod_axis_one(void)
 {
     /* x is 3x3 variable, global index 1, total 10 vars
      * x = [1, 2, 3, 4, 5, 6, 7, 8, 9] (column-major order)
@@ -50,7 +50,7 @@ const char *test_jacobian_prod_axis_one()
     return 0;
 }
 
-const char *test_jacobian_prod_axis_one_one_zero()
+const char *test_jacobian_prod_axis_one_one_zero(void)
 {
     /* x is 3x3 variable, global index 1, total 10 vars
      * x = [1, 2, 3, 4, 0, 6, 7, 8, 9] (column-major order)
diff --git a/tests/jacobian_tests/test_prod_axis_zero.h b/tests/jacobian_tests/test_prod_axis_zero.h
index 7b3179e..f9a8958 100644
--- a/tests/jacobian_tests/test_prod_axis_zero.h
+++ b/tests/jacobian_tests/test_prod_axis_zero.h
@@ -6,7 +6,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_prod_axis_zero()
+const char *test_jacobian_prod_axis_zero(void)
 {
     /* x is 2x3 variable, global index 1, total 8 vars
      * x = [1, 2, 3, 4, 5, 6] (column-major order)
diff --git a/tests/jacobian_tests/test_quad_form.h b/tests/jacobian_tests/test_quad_form.h
index 48345ef..4012c58 100644
--- a/tests/jacobian_tests/test_quad_form.h
+++ b/tests/jacobian_tests/test_quad_form.h
@@ -9,7 +9,7 @@
 #include <stdio.h>
 #include <string.h>
 
-const char *test_quad_form()
+const char *test_quad_form(void)
 {
     // x^T Q x where x is 3 x 1 variable and has global index 2,
     // Q = [1 2 0; 2 3 0; 0 0 4]
@@ -41,7 +41,7 @@ const char *test_quad_form()
 }
 
 /* This test is commented out, see the function eval_jabobian_old in
-src/other/quad_form.c. const char *test_quad_form2()
+src/other/quad_form.c. const char *test_quad_form2(void)
 {
     // (Au)^T Q (Au) where u is 6 x 1,
     //    Q = [1 2 0;
diff --git a/tests/jacobian_tests/test_quad_over_lin.h b/tests/jacobian_tests/test_quad_over_lin.h
index 24a24fa..6e93baf 100644
--- a/tests/jacobian_tests/test_quad_over_lin.h
+++ b/tests/jacobian_tests/test_quad_over_lin.h
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <string.h>
 
-const char *test_quad_over_lin1()
+const char *test_quad_over_lin1(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 1 x 1
     // we compute jacobian of x^T x / y
@@ -32,7 +32,7 @@ const char *test_quad_over_lin1()
     return 0;
 }
 
-const char *test_quad_over_lin2()
+const char *test_quad_over_lin2(void)
 {
     // var = (z, y, w, x) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 1 x 1
     // we compute jacobian of x^T x / y
@@ -56,7 +56,7 @@ const char *test_quad_over_lin2()
     return 0;
 }
 
-const char *test_quad_over_lin3()
+const char *test_quad_over_lin3(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 1 x 1
     // we compute jacobian of (Ax)^T(Ax)/y where
@@ -95,7 +95,7 @@ const char *test_quad_over_lin3()
     return 0;
 }
 
-const char *test_quad_over_lin4()
+const char *test_quad_over_lin4(void)
 {
     // var = (z, y, w, x) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 1 x 1
     // we compute jacobian of (Ax)^T(Ax)/y where
@@ -135,7 +135,7 @@ const char *test_quad_over_lin4()
     return 0;
 }
 
-const char *test_quad_over_lin5()
+const char *test_quad_over_lin5(void)
 {
     // var = (z, y, w, x) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 1 x 1
     // we compute jacobian of (Avar)^T(Avar)/y where
diff --git a/tests/jacobian_tests/test_rel_entr.h b/tests/jacobian_tests/test_rel_entr.h
index 6cd5b9f..1c90871 100644
--- a/tests/jacobian_tests/test_rel_entr.h
+++ b/tests/jacobian_tests/test_rel_entr.h
@@ -6,7 +6,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_rel_entr_vector_args_1()
+const char *test_jacobian_rel_entr_vector_args_1(void)
 {
     // var = (z, x, w, y) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 3 x 1
     // we compute jacobian of x log(x) - x log(y)
@@ -37,7 +37,7 @@ const char *test_jacobian_rel_entr_vector_args_1()
     return 0;
 }
 
-const char *test_jacobian_rel_entr_vector_args_2()
+const char *test_jacobian_rel_entr_vector_args_2(void)
 {
     // var = (z, y, w, x) where z is 2 x 1, x is 3 x 1, w is 2 x 1, y is 3 x 1
     // we compute jacobian of x log(x) - x log(y)
@@ -68,7 +68,7 @@ const char *test_jacobian_rel_entr_vector_args_2()
     return 0;
 }
 
-const char *test_jacobian_rel_entr_matrix_args()
+const char *test_jacobian_rel_entr_matrix_args(void)
 {
     // x, y are 3 x 2 matrices (vectorized columnwise)
     // x has global variable index 0, y has global variable index 6
diff --git a/tests/jacobian_tests/test_rel_entr_scalar_vector.h b/tests/jacobian_tests/test_rel_entr_scalar_vector.h
index 2f5b402..0009fb5 100644
--- a/tests/jacobian_tests/test_rel_entr_scalar_vector.h
+++ b/tests/jacobian_tests/test_rel_entr_scalar_vector.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_rel_entr_scalar_vector()
+const char *test_jacobian_rel_entr_scalar_vector(void)
 {
     // x is scalar 1, y is 3x1 with values [1, 2, 3]
     double u_vals[4] = {1.0, 1.0, 2.0, 3.0};
diff --git a/tests/jacobian_tests/test_rel_entr_vector_scalar.h b/tests/jacobian_tests/test_rel_entr_vector_scalar.h
index 6ed7a42..aeef25e 100644
--- a/tests/jacobian_tests/test_rel_entr_vector_scalar.h
+++ b/tests/jacobian_tests/test_rel_entr_vector_scalar.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_rel_entr_vector_scalar()
+const char *test_jacobian_rel_entr_vector_scalar(void)
 {
     // x is 3x1 with values [1, 2, 3], y is scalar 4
     double u_vals[4] = {1.0, 2.0, 3.0, 4.0};
diff --git a/tests/jacobian_tests/test_right_matmul.h b/tests/jacobian_tests/test_right_matmul.h
index 931df44..6155b19 100644
--- a/tests/jacobian_tests/test_right_matmul.h
+++ b/tests/jacobian_tests/test_right_matmul.h
@@ -8,7 +8,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_right_matmul_log()
+const char *test_jacobian_right_matmul_log(void)
 {
     /* Test Jacobian of log(x) @ A where:
      * x is 2x2 variable at x = [[1, 2], [3, 4]] (vectorized column-wise: [1, 3, 2,
@@ -56,7 +56,7 @@ const char *test_jacobian_right_matmul_log()
     return 0;
 }
 
-const char *test_jacobian_right_matmul_log_vector()
+const char *test_jacobian_right_matmul_log_vector(void)
 {
     /* Test Jacobian of log(x) @ A where:
      * x is 1x3 variable at x = [1, 2, 3]
diff --git a/tests/jacobian_tests/test_sum.h b/tests/jacobian_tests/test_sum.h
index d7636c5..e42f47a 100644
--- a/tests/jacobian_tests/test_sum.h
+++ b/tests/jacobian_tests/test_sum.h
@@ -7,7 +7,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_sum_log()
+const char *test_jacobian_sum_log(void)
 {
     /* Test Jacobian of sum(log(x)) where x is 3x1 variable evaluated at [1, 2, 3]
      * Jacobian should be [0, 0, 1/1, 1/2, 1/3, 0] = [0, 0, 1.0, 0.5, 1/3, 0] as a
@@ -32,7 +32,7 @@ const char *test_jacobian_sum_log()
     return 0;
 }
 
-const char *test_jacobian_sum_mult()
+const char *test_jacobian_sum_mult(void)
 {
     /* Test Jacobian of sum(x * y) where both x and y are 3x1 variables
      * x has global variable index 2, y has global variable index 6
@@ -66,7 +66,7 @@ const char *test_jacobian_sum_mult()
     return 0;
 }
 
-const char *test_jacobian_sum_log_axis_0()
+const char *test_jacobian_sum_log_axis_0(void)
 {
     /* Test Jacobian of sum(log(x), axis=0) where x is 3x2 variable,
      * global index 2, total 8 variables
@@ -105,7 +105,7 @@ const char *test_jacobian_sum_log_axis_0()
     free_expr(sum_node);
     return 0;
 }
-const char *test_jacobian_sum_add_log_axis_0()
+const char *test_jacobian_sum_add_log_axis_0(void)
 {
     /* Test Jacobian of sum(add(log(x), log(y)), axis=0) where x and y are 3x2
      * x.value = [[1.0, 2.0],
@@ -152,7 +152,7 @@ const char *test_jacobian_sum_add_log_axis_0()
     free_expr(sum_node);
     return 0;
 }
-const char *test_jacobian_sum_log_axis_1()
+const char *test_jacobian_sum_log_axis_1(void)
 {
     /* Test Jacobian of sum(log(x), axis=1) where x is 3x2 variable,
      * global index 2, total 8 variables
diff --git a/tests/jacobian_tests/test_trace.h b/tests/jacobian_tests/test_trace.h
index b2f9810..1e67afa 100644
--- a/tests/jacobian_tests/test_trace.h
+++ b/tests/jacobian_tests/test_trace.h
@@ -6,7 +6,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_jacobian_trace_variable()
+const char *test_jacobian_trace_variable(void)
 {
     /* Test Jacobian of trace(x) where x is 3x3 variable
      * x has global variable index 1
@@ -47,7 +47,7 @@ const char *test_jacobian_trace_variable()
     return 0;
 }
 
-const char *test_jacobian_trace_composite()
+const char *test_jacobian_trace_composite(void)
 {
     /* Test Jacobian of trace(log(x) + exp(x)) where x is 3x3 variable
      * x has global variable index 1
diff --git a/tests/jacobian_tests/test_transpose.h b/tests/jacobian_tests/test_transpose.h
index 4fd22d5..96ae6b2 100644
--- a/tests/jacobian_tests/test_transpose.h
+++ b/tests/jacobian_tests/test_transpose.h
@@ -8,7 +8,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_jacobian_transpose()
+const char *test_jacobian_transpose(void)
 {
     // A = [1 2; 3 4]
     CSR_Matrix *A = new_csr_matrix(2, 2, 4);
diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h
index a8e819b..89a5f97 100644
--- a/tests/profiling/profile_left_matmul.h
+++ b/tests/profiling/profile_left_matmul.h
@@ -11,7 +11,7 @@
 #include "test_helpers.h"
 #include "utils/Timer.h"
 
-const char *profile_left_matmul()
+const char *profile_left_matmul(void)
 {
     /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */
     int n = 100;
diff --git a/tests/utils/test_cblas.h b/tests/utils/test_cblas.h
new file mode 100644
index 0000000..9d8c0f6
--- /dev/null
+++ b/tests/utils/test_cblas.h
@@ -0,0 +1,18 @@
+#ifndef TEST_CBLAS_H
+#define TEST_CBLAS_H
+
+#include "minunit.h"
+#include "utils/cblas_wrapper.h"
+#include <math.h>
+
+static char *test_cblas_ddot(void)
+{
+    double x[] = {1.0, 2.0, 3.0, 4.0};
+    double y[] = {5.0, 6.0, 7.0, 8.0};
+    double result = cblas_ddot(4, x, 1, y, 1);
+    double expected = 1.0 * 5.0 + 2.0 * 6.0 + 3.0 * 7.0 + 4.0 * 8.0;
+    mu_assert("test_cblas_ddot: wrong dot product", fabs(result - expected) < 1e-12);
+    return 0;
+}
+
+#endif /* TEST_CBLAS_H */
diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_coo_matrix.h
index 8c6f14e..53fa307 100644
--- a/tests/utils/test_coo_matrix.h
+++ b/tests/utils/test_coo_matrix.h
@@ -6,7 +6,7 @@
 #include "test_helpers.h"
 #include "utils/COO_Matrix.h"
 
-const char *test_csr_to_coo()
+const char *test_csr_to_coo(void)
 {
     /* Create a 3x3 CSR matrix A:
      * [1.0  2.0  0.0]
@@ -41,7 +41,7 @@ const char *test_csr_to_coo()
     return 0;
 }
 
-const char *test_csr_to_coo_lower_triangular()
+const char *test_csr_to_coo_lower_triangular(void)
 {
     /* Symmetric 3x3 matrix:
      * [1  2  3]
@@ -79,7 +79,7 @@ const char *test_csr_to_coo_lower_triangular()
     return 0;
 }
 
-const char *test_refresh_lower_triangular_coo()
+const char *test_refresh_lower_triangular_coo(void)
 {
     CSR_Matrix *A = new_csr_matrix(3, 3, 9);
     int Ap[4] = {0, 3, 6, 9};
diff --git a/tests/utils/test_csc_matrix.h b/tests/utils/test_csc_matrix.h
index 803f5e6..ed0a304 100644
--- a/tests/utils/test_csc_matrix.h
+++ b/tests/utils/test_csc_matrix.h
@@ -7,7 +7,7 @@
 #include "test_helpers.h"
 #include "utils/CSC_Matrix.h"
 
-const char *test_csr_to_csc1()
+const char *test_csr_to_csc1(void)
 {
     CSR_Matrix *A = new_csr_matrix(4, 5, 5);
     double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0};
@@ -33,7 +33,7 @@ const char *test_csr_to_csc1()
     return 0;
 }
 
-const char *test_csr_to_csc2()
+const char *test_csr_to_csc2(void)
 {
     CSR_Matrix *A = new_csr_matrix(20, 30, 120);
     double Ax[120] = {9, 6, 5, 9, 7, 3, 8, 2, 6, 1, 3, 9, 2, 8, 9, 1, 4, 9, 2, 1,
@@ -98,7 +98,7 @@ const char *test_csr_to_csc2()
  * [0  x  0]
  * [x  0  x]
  */
-const char *test_ATA_alloc_simple()
+const char *test_ATA_alloc_simple(void)
 {
     CSC_Matrix *A = new_csc_matrix(4, 3, 6);
     int Ap[4] = {0, 2, 3, 6};
@@ -134,7 +134,7 @@ const char *test_ATA_alloc_simple()
  * [2  0  0  4]
  *
  */
-const char *test_ATA_alloc_diagonal_like()
+const char *test_ATA_alloc_diagonal_like(void)
 {
     /* Create A in CSC format (3 rows, 4 cols, 4 nonzeros) */
     CSC_Matrix *A = new_csc_matrix(3, 4, 4);
@@ -157,7 +157,7 @@ const char *test_ATA_alloc_diagonal_like()
     return 0;
 }
 
-const char *test_ATA_alloc_random()
+const char *test_ATA_alloc_random(void)
 {
     /* Create A in CSC format  */
     CSC_Matrix *A = new_csc_matrix(10, 15, 15);
@@ -194,7 +194,7 @@ const char *test_ATA_alloc_random()
     return 0;
 }
 
-const char *test_ATA_alloc_random2()
+const char *test_ATA_alloc_random2(void)
 {
     /* Create A in CSC format  */
     int m = 15;
@@ -231,7 +231,7 @@ const char *test_ATA_alloc_random2()
 
     return 0;
 }
-const char *test_BTA_alloc_and_BTDA_fill()
+const char *test_BTA_alloc_and_BTDA_fill(void)
 {
     /* Create A: 4x3 CSC matrix
      * [1.0  0.0  2.0]
diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h
index c7daeb6..8c0ba10 100644
--- a/tests/utils/test_csr_csc_conversion.h
+++ b/tests/utils/test_csr_csc_conversion.h
@@ -9,7 +9,7 @@
 #include "utils/CSR_Matrix.h"
 
 /* Test CSR to CSC conversion with fill_sparsity and fill_values */
-const char *test_csr_to_csc_split()
+const char *test_csr_to_csc_split(void)
 {
     /* Create a 4x5 CSR matrix A:
      * [1.0  0.0  0.0  0.0  1.0]
@@ -54,7 +54,7 @@ const char *test_csr_to_csc_split()
 }
 
 /* Test CSC to CSR conversion with fill_sparsity */
-const char *test_csc_to_csr_sparsity()
+const char *test_csc_to_csr_sparsity(void)
 {
     /* Create a 4x5 CSC matrix A:
      * [1.0  0.0  0.0  0.0  2.0]
@@ -98,7 +98,7 @@ const char *test_csc_to_csr_sparsity()
 }
 
 /* Test CSC to CSR conversion with fill_values */
-const char *test_csc_to_csr_values()
+const char *test_csc_to_csr_values(void)
 {
     /* Create a 4x5 CSC matrix A */
     CSC_Matrix *A = new_csc_matrix(4, 5, 5);
@@ -131,7 +131,7 @@ const char *test_csc_to_csr_values()
 }
 
 /* Test round-trip conversion: CSR -> CSC -> CSR */
-const char *test_csr_csc_csr_roundtrip()
+const char *test_csr_csc_csr_roundtrip(void)
 {
     /* Create a 3x4 CSR matrix A:
      * [1.0  2.0  0.0  3.0]
diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h
index 09b2e7d..f6c9536 100644
--- a/tests/utils/test_csr_matrix.h
+++ b/tests/utils/test_csr_matrix.h
@@ -8,7 +8,7 @@
 #include "utils/CSR_sum.h"
 #include "utils/int_double_pair.h"
 
-const char *test_diag_csr_mult()
+const char *test_diag_csr_mult(void)
 {
     /* Create a 3x3 CSR matrix A:
      * [1.0  2.0  0.0]
@@ -51,7 +51,7 @@ const char *test_diag_csr_mult()
 [0  3  0] + [2 0 3] =   [2 3 3]
 [4  0  5]   [0 6 0]     [4 6 5]
 */
-const char *test_csr_sum()
+const char *test_csr_sum(void)
 {
     CSR_Matrix *A = new_csr_matrix(3, 3, 5);
     double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
@@ -93,7 +93,7 @@ const char *test_csr_sum()
 [0  0  3] + [2 0 3] =   [2 0 6]
 [4  0  5]   [0 6 0]     [4 6 5]
 */
-const char *test_csr_sum2()
+const char *test_csr_sum2(void)
 {
     CSR_Matrix *A = new_csr_matrix(3, 3, 5);
     double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
@@ -130,7 +130,7 @@ const char *test_csr_sum2()
     return 0;
 }
 
-const char *test_transpose()
+const char *test_transpose(void)
 {
     CSR_Matrix *A = new_csr_matrix(4, 5, 5);
     double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0};
@@ -163,7 +163,7 @@ A = [1 0 0 0 1
      2 0 0 0 0
      0 4 0 0 0]
 */
-const char *test_csr_vecmat_values_sparse()
+const char *test_csr_vecmat_values_sparse(void)
 {
     CSR_Matrix *A = new_csr_matrix(4, 5, 5);
     double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0};
@@ -200,7 +200,7 @@ const char *test_csr_vecmat_values_sparse()
 
     return 0;
 }
-const char *test_sum_all_rows_csr()
+const char *test_sum_all_rows_csr(void)
 {
     /* Create a 3x4 CSR matrix A:
      * [1.0  2.0  0.0  0.0]
@@ -235,7 +235,7 @@ const char *test_sum_all_rows_csr()
 
     return 0;
 }
-const char *test_sum_block_of_rows_csr()
+const char *test_sum_block_of_rows_csr(void)
 {
     /* Create a 9x4 CSR matrix A and sum blocks of size 3
      * Block 0 (rows 0-2):
@@ -314,7 +314,7 @@ const char *test_sum_block_of_rows_csr()
 
     return 0;
 }
-const char *test_sum_evenly_spaced_rows_csr()
+const char *test_sum_evenly_spaced_rows_csr(void)
 {
     /* Create a 9x4 CSR matrix A (same as test_sum_block_of_rows_csr) and sum evenly
      * spaced rows With row_spacing=3:
@@ -392,7 +392,7 @@ const char *test_sum_evenly_spaced_rows_csr()
 
     return 0;
 }
-const char *test_AT_alloc_and_fill()
+const char *test_AT_alloc_and_fill(void)
 {
     /* Create a 3x4 CSR matrix A:
      * [1.0  0.0  2.0  0.0]
@@ -437,7 +437,7 @@ const char *test_AT_alloc_and_fill()
     return 0;
 }
 
-const char *test_kron_identity_csr()
+const char *test_kron_identity_csr(void)
 {
     /* Test A kron I_p where:
      * A is 2x3:
diff --git a/tests/utils/test_linalg_sparse_matmuls.h b/tests/utils/test_linalg_sparse_matmuls.h
index 6fd5bfa..120f99b 100644
--- a/tests/utils/test_linalg_sparse_matmuls.h
+++ b/tests/utils/test_linalg_sparse_matmuls.h
@@ -10,7 +10,7 @@
 #include "utils/linalg_sparse_matmuls.h"
 
 /* Test block_left_multiply_fill_sparsity with simple case: single block */
-const char *test_block_left_multiply_single_block()
+const char *test_block_left_multiply_single_block(void)
 {
     /* A is 2x3 CSR:
      * [1.0  0.0  0.0]
@@ -61,7 +61,7 @@ const char *test_block_left_multiply_single_block()
 }
 
 /* Test block_left_multiply_fill_sparsity with two blocks */
-const char *test_block_left_multiply_two_blocks()
+const char *test_block_left_multiply_two_blocks(void)
 {
     /* A is 2x2 CSR:
      * [1.0  0.0]
@@ -128,7 +128,7 @@ const char *test_block_left_multiply_two_blocks()
 }
 
 /* Test block_left_multiply_fill_sparsity with all zero column in J */
-const char *test_block_left_multiply_zero_column()
+const char *test_block_left_multiply_zero_column(void)
 {
     /* A is 2x2 CSR (identity) */
     CSR_Matrix *A = new_csr_matrix(2, 2, 2);
@@ -167,7 +167,7 @@ const char *test_block_left_multiply_zero_column()
 }
 
 /* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */
-const char *test_csr_csc_matmul_alloc_basic()
+const char *test_csr_csc_matmul_alloc_basic(void)
 {
     /* A is 3x2 CSR:
      * [1.0  0.0]
@@ -215,7 +215,7 @@ const char *test_csr_csc_matmul_alloc_basic()
 }
 
 /* Test csr_csc_matmul_alloc with sparse result */
-const char *test_csr_csc_matmul_alloc_sparse()
+const char *test_csr_csc_matmul_alloc_sparse(void)
 {
     /* A is 2x3 CSR:
      * [1.0  0.0  0.0]
@@ -262,7 +262,7 @@ const char *test_csr_csc_matmul_alloc_sparse()
 }
 
 /* Test block_left_multiply_vec with single block: y = A @ x */
-const char *test_block_left_multiply_vec_single_block()
+const char *test_block_left_multiply_vec_single_block(void)
 {
     /* A is 2x3 CSR:
      * [1.0  0.0  2.0]
@@ -294,7 +294,7 @@ const char *test_block_left_multiply_vec_single_block()
 }
 
 /* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */
-const char *test_block_left_multiply_vec_two_blocks()
+const char *test_block_left_multiply_vec_two_blocks(void)
 {
     /* A is 2x3 CSR:
      * [1.0  2.0  0.0]
@@ -328,7 +328,7 @@ const char *test_block_left_multiply_vec_two_blocks()
 }
 
 /* Test block_left_multiply_vec with sparse matrix and multiple blocks */
-const char *test_block_left_multiply_vec_sparse()
+const char *test_block_left_multiply_vec_sparse(void)
 {
     /* A is 3x4 CSR (very sparse):
      * [2.0  0.0  0.0  0.0]
@@ -363,7 +363,7 @@ const char *test_block_left_multiply_vec_sparse()
 }
 
 /* Test block_left_multiply_vec with three blocks */
-const char *test_block_left_multiply_vec_three_blocks()
+const char *test_block_left_multiply_vec_three_blocks(void)
 {
     /* A is 2x2 CSR:
      * [1.0  2.0]
diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h
new file mode 100644
index 0000000..8add477
--- /dev/null
+++ b/tests/utils/test_matrix.h
@@ -0,0 +1,136 @@
+#ifndef TEST_MATRIX_H
+#define TEST_MATRIX_H
+
+#include "minunit.h"
+#include "test_helpers.h"
+#include "utils/matrix.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* Test dense block_left_mult_vec against known result.
+   A = [1 2; 3 4] (2x2), x = [1; 2], p = 1
+   y = A * x = [1*1+2*2; 3*1+4*2] = [5; 11] */
+const char *test_dense_matrix_mult_vec(void)
+{
+    double data[] = {1.0, 2.0, 3.0, 4.0};
+    Matrix *A = new_dense_matrix(2, 2, data);
+
+    double x[] = {1.0, 2.0};
+    double y[2] = {0.0, 0.0};
+
+    A->block_left_mult_vec(A, x, y, 1);
+
+    double y_expected[2] = {5.0, 11.0};
+    mu_assert("y incorrect", cmp_double_array(y, y_expected, 2));
+
+    free_matrix(A);
+    return 0;
+}
+
+/* Test dense block_left_mult_vec with multiple blocks.
+   A = [1 2; 3 4] (2x2), x = [1; 2; 3; 4], p = 2
+   y = [A*[1;2]; A*[3;4]] = [5; 11; 11; 25] */
+const char *test_dense_matrix_mult_vec_blocks(void)
+{
+    double data[] = {1.0, 2.0, 3.0, 4.0};
+    Matrix *A = new_dense_matrix(2, 2, data);
+
+    double x[] = {1.0, 2.0, 3.0, 4.0};
+    double y[4] = {0};
+
+    A->block_left_mult_vec(A, x, y, 2);
+
+    double y_expected[4] = {5.0, 11.0, 11.0, 25.0};
+    mu_assert("y incorrect", cmp_double_array(y, y_expected, 4));
+
+    free_matrix(A);
+    return 0;
+}
+
+/* Compare sparse vs dense block_left_mult_vec for a non-square matrix.
+   A = [1 2 3; 4 5 6] (2x3), x = [1; 2; 3], p = 1 */
+const char *test_sparse_vs_dense_mult_vec(void)
+{
+    /* Build CSR for A = [1 2 3; 4 5 6] */
+    CSR_Matrix *csr = new_csr_matrix(2, 3, 6);
+    int Ap[3] = {0, 3, 6};
+    int Ai[6] = {0, 1, 2, 0, 1, 2};
+    double Ax[6] = {1, 2, 3, 4, 5, 6};
+    memcpy(csr->p, Ap, 3 * sizeof(int));
+    memcpy(csr->i, Ai, 6 * sizeof(int));
+    memcpy(csr->x, Ax, 6 * sizeof(double));
+
+    double dense_data[] = {1, 2, 3, 4, 5, 6};
+
+    Matrix *sparse = new_sparse_matrix(csr);
+    Matrix *dense = new_dense_matrix(2, 3, dense_data);
+
+    double x[] = {1.0, 2.0, 3.0};
+    double y_sparse[2] = {0};
+    double y_dense[2] = {0};
+
+    sparse->block_left_mult_vec(sparse, x, y_sparse, 1);
+    dense->block_left_mult_vec(dense, x, y_dense, 1);
+
+    mu_assert("sparse vs dense mismatch", cmp_double_array(y_sparse, y_dense, 2));
+
+    free_matrix(sparse);
+    free_matrix(dense);
+    free_csr_matrix(csr);
+    return 0;
+}
+
+/* Test dense transpose */
+const char *test_dense_matrix_trans(void)
+{
+    double data[] = {1, 2, 3, 4, 5, 6}; /* 2x3 */
+    Matrix *A = new_dense_matrix(2, 3, data);
+    Matrix *AT = dense_matrix_trans((const Dense_Matrix *) A);
+
+    mu_assert("transpose m", AT->m == 3);
+    mu_assert("transpose n", AT->n == 2);
+
+    /* AT should be [1 4; 2 5; 3 6] stored row-major */
+    Dense_Matrix *dm = (Dense_Matrix *) AT;
+    double AT_expected[6] = {1.0, 4.0, 2.0, 5.0, 3.0, 6.0};
+    mu_assert("AT vals incorrect", cmp_double_array(dm->x, AT_expected, 6));
+
+    free_matrix(A);
+    free_matrix(AT);
+    return 0;
+}
+
+/* Compare sparse vs dense block_left_mult_vec with p=2 blocks.
+   A = [1 2; 3 4], x = [1; 2; 3; 4], p = 2 */
+const char *test_sparse_vs_dense_mult_vec_blocks(void)
+{
+    CSR_Matrix *csr = new_csr_matrix(2, 2, 4);
+    int Ap[3] = {0, 2, 4};
+    int Ai[4] = {0, 1, 0, 1};
+    double Ax[4] = {1, 2, 3, 4};
+    memcpy(csr->p, Ap, 3 * sizeof(int));
+    memcpy(csr->i, Ai, 4 * sizeof(int));
+    memcpy(csr->x, Ax, 4 * sizeof(double));
+
+    double dense_data[] = {1, 2, 3, 4};
+
+    Matrix *sparse = new_sparse_matrix(csr);
+    Matrix *dense = new_dense_matrix(2, 2, dense_data);
+
+    double x[] = {1.0, 2.0, 3.0, 4.0};
+    double y_sparse[4] = {0};
+    double y_dense[4] = {0};
+
+    sparse->block_left_mult_vec(sparse, x, y_sparse, 2);
+    dense->block_left_mult_vec(dense, x, y_dense, 2);
+
+    mu_assert("sparse vs dense blocks mismatch",
+              cmp_double_array(y_sparse, y_dense, 4));
+
+    free_matrix(sparse);
+    free_matrix(dense);
+    free_csr_matrix(csr);
+    return 0;
+}
+
+#endif /* TEST_MATRIX_H */
diff --git a/tests/wsum_hess/elementwise/test_entr.h b/tests/wsum_hess/elementwise/test_entr.h
index 6aa818c..b761e9a 100644
--- a/tests/wsum_hess/elementwise/test_entr.h
+++ b/tests/wsum_hess/elementwise/test_entr.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_entr()
+const char *test_wsum_hess_entr(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
diff --git a/tests/wsum_hess/elementwise/test_exp.h b/tests/wsum_hess/elementwise/test_exp.h
index 9ed6323..a549b0a 100644
--- a/tests/wsum_hess/elementwise/test_exp.h
+++ b/tests/wsum_hess/elementwise/test_exp.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_exp()
+const char *test_wsum_hess_exp(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
diff --git a/tests/wsum_hess/elementwise/test_hyperbolic.h b/tests/wsum_hess/elementwise/test_hyperbolic.h
index 7af436a..5ffb233 100644
--- a/tests/wsum_hess/elementwise/test_hyperbolic.h
+++ b/tests/wsum_hess/elementwise/test_hyperbolic.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_sinh()
+const char *test_wsum_hess_sinh(void)
 {
     /* Test: wsum_hess of sinh(x) where x = [1, 2, 3] (3x1) at global variable index
      * 0 Total 3 variables, weight w = [1, 2, 3]
@@ -44,7 +44,7 @@ const char *test_wsum_hess_sinh()
     return 0;
 }
 
-const char *test_wsum_hess_tanh()
+const char *test_wsum_hess_tanh(void)
 {
     /* Test: wsum_hess of tanh(x) where x = [1, 2, 3] (3x1) at global variable index
      * 0 Total 3 variables, weight w = [1, 2, 3]
@@ -81,7 +81,7 @@ const char *test_wsum_hess_tanh()
     return 0;
 }
 
-const char *test_wsum_hess_asinh()
+const char *test_wsum_hess_asinh(void)
 {
     /* Test: wsum_hess of asinh(x) where x = [1, 2, 3] (3x1) at global variable index
      * 0 Total 3 variables, weight w = [1, 2, 3]
@@ -118,7 +118,7 @@ const char *test_wsum_hess_asinh()
     return 0;
 }
 
-const char *test_wsum_hess_atanh()
+const char *test_wsum_hess_atanh(void)
 {
     /* Test: wsum_hess of atanh(x) where x = [0.1, 0.2, 0.3] (3x1) at global variable
      * index 0 Total 3 variables, weight w = [1, 2, 3]
diff --git a/tests/wsum_hess/elementwise/test_log.h b/tests/wsum_hess/elementwise/test_log.h
index 5df089a..4c1af91 100644
--- a/tests/wsum_hess/elementwise/test_log.h
+++ b/tests/wsum_hess/elementwise/test_log.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_log()
+const char *test_wsum_hess_log(void)
 {
     /* Test: wsum_hess of log(x) where x = [1, 2, 3] (3x1) at global variable index 2
      * Total 7 variables, weight w = [1, 2, 3]
@@ -48,7 +48,7 @@ const char *test_wsum_hess_log()
     return 0;
 }
 
-const char *test_wsum_hess_log_composite()
+const char *test_wsum_hess_log_composite(void)
 {
     double u_vals[5] = {1, 2, 3, 4, 5};
     double w[3] = {-1, -2, -3};
diff --git a/tests/wsum_hess/elementwise/test_logistic.h b/tests/wsum_hess/elementwise/test_logistic.h
index 714fe40..ecf8d2b 100644
--- a/tests/wsum_hess/elementwise/test_logistic.h
+++ b/tests/wsum_hess/elementwise/test_logistic.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_logistic()
+const char *test_wsum_hess_logistic(void)
 {
     /* Test: wsum_hess of logistic(x) where x = [1, 2, 3] (3x1) at global variable
      * index 0 Total 3 variables, weight w = [1, 2, 3]
diff --git a/tests/wsum_hess/elementwise/test_power.h b/tests/wsum_hess/elementwise/test_power.h
index d830a9d..76044a6 100644
--- a/tests/wsum_hess/elementwise/test_power.h
+++ b/tests/wsum_hess/elementwise/test_power.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_power()
+const char *test_wsum_hess_power(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
diff --git a/tests/wsum_hess/elementwise/test_trig.h b/tests/wsum_hess/elementwise/test_trig.h
index fd84dc4..e0ed0cf 100644
--- a/tests/wsum_hess/elementwise/test_trig.h
+++ b/tests/wsum_hess/elementwise/test_trig.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_sin()
+const char *test_wsum_hess_sin(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
@@ -37,7 +37,7 @@ const char *test_wsum_hess_sin()
     return 0;
 }
 
-const char *test_wsum_hess_cos()
+const char *test_wsum_hess_cos(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
@@ -65,7 +65,7 @@ const char *test_wsum_hess_cos()
     return 0;
 }
 
-const char *test_wsum_hess_tan()
+const char *test_wsum_hess_tan(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
diff --git a/tests/wsum_hess/elementwise/test_xexp.h b/tests/wsum_hess/elementwise/test_xexp.h
index 0550571..332785f 100644
--- a/tests/wsum_hess/elementwise/test_xexp.h
+++ b/tests/wsum_hess/elementwise/test_xexp.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_xexp()
+const char *test_wsum_hess_xexp(void)
 {
     double u_vals[3] = {1.0, 2.0, 3.0};
     double w[3] = {1.0, 2.0, 3.0};
diff --git a/tests/wsum_hess/test_broadcast.h b/tests/wsum_hess/test_broadcast.h
index 54ba36c..edccae0 100644
--- a/tests/wsum_hess/test_broadcast.h
+++ b/tests/wsum_hess/test_broadcast.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_broadcast_row()
+const char *test_wsum_hess_broadcast_row(void)
 {
     /* Test: wsum_hess of broadcast_row(log(x)) where x is (1, 3)
      * x = [1.0, 2.0, 4.0] (row vector)
@@ -66,7 +66,7 @@ const char *test_wsum_hess_broadcast_row()
     return 0;
 }
 
-const char *test_wsum_hess_broadcast_col()
+const char *test_wsum_hess_broadcast_col(void)
 {
     /* Test: wsum_hess of broadcast_col(log(x)) where x is (3, 1)
      * x = [1.0, 2.0, 4.0]^T (column vector)
@@ -121,7 +121,7 @@ const char *test_wsum_hess_broadcast_col()
     return 0;
 }
 
-const char *test_wsum_hess_broadcast_scalar_to_matrix()
+const char *test_wsum_hess_broadcast_scalar_to_matrix(void)
 {
     /* Test: wsum_hess of broadcast_scalar(log(x)) where x is scalar (1, 1)
      * x = 5.0
diff --git a/tests/wsum_hess/test_const_scalar_mult.h b/tests/wsum_hess/test_const_scalar_mult.h
index 7172be4..34ccb7c 100644
--- a/tests/wsum_hess/test_const_scalar_mult.h
+++ b/tests/wsum_hess/test_const_scalar_mult.h
@@ -9,7 +9,7 @@
 
 /* Test: y = a * log(x) where a is a scalar constant */
 
-const char *test_wsum_hess_const_scalar_mult_log_vector()
+const char *test_wsum_hess_const_scalar_mult_log_vector(void)
 {
     /* Create variable x: [1.0, 2.0, 4.0] */
     double u_vals[3] = {1.0, 2.0, 4.0};
@@ -54,7 +54,7 @@ const char *test_wsum_hess_const_scalar_mult_log_vector()
     return 0;
 }
 
-const char *test_wsum_hess_const_scalar_mult_log_matrix()
+const char *test_wsum_hess_const_scalar_mult_log_matrix(void)
 {
     /* Create variable x as 2x2 matrix: [[1.0, 2.0], [4.0, 8.0]] */
     double u_vals[4] = {1.0, 2.0, 4.0, 8.0};
diff --git a/tests/wsum_hess/test_const_vector_mult.h b/tests/wsum_hess/test_const_vector_mult.h
index 88bc127..b0a75b5 100644
--- a/tests/wsum_hess/test_const_vector_mult.h
+++ b/tests/wsum_hess/test_const_vector_mult.h
@@ -9,7 +9,7 @@
 
 /* Test: y = a ∘ log(x) where a is a constant vector */
 
-const char *test_wsum_hess_const_vector_mult_log_vector()
+const char *test_wsum_hess_const_vector_mult_log_vector(void)
 {
     /* Create variable x: [1.0, 2.0, 4.0] */
     double u_vals[3] = {1.0, 2.0, 4.0};
@@ -52,7 +52,7 @@ const char *test_wsum_hess_const_vector_mult_log_vector()
     return 0;
 }
 
-const char *test_wsum_hess_const_vector_mult_log_matrix()
+const char *test_wsum_hess_const_vector_mult_log_matrix(void)
 {
     /* Create variable x as 2x2 matrix: [[1.0, 2.0], [4.0, 8.0]] */
     double u_vals[4] = {1.0, 2.0, 4.0, 8.0};
diff --git a/tests/wsum_hess/test_hstack.h b/tests/wsum_hess/test_hstack.h
index 49c22b2..a94ec7c 100644
--- a/tests/wsum_hess/test_hstack.h
+++ b/tests/wsum_hess/test_hstack.h
@@ -6,7 +6,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_hstack()
+const char *test_wsum_hess_hstack(void)
 {
     /* Test: hstack([log(x), log(z), exp(x), sin(y)])
      * Variables: x at idx 0, z at idx 3, y at idx 6
@@ -100,7 +100,7 @@ const char *test_wsum_hess_hstack()
     return 0;
 }
 
-const char *test_wsum_hess_hstack_matrix()
+const char *test_wsum_hess_hstack_matrix(void)
 {
     /* Test: hstack([log(x), log(z), exp(x), sin(y)]) with matrix variables
      * Variables: x at idx 0, z at idx 6, y at idx 12
diff --git a/tests/wsum_hess/test_left_matmul.h b/tests/wsum_hess/test_left_matmul.h
index 28d1cec..00f587e 100644
--- a/tests/wsum_hess/test_left_matmul.h
+++ b/tests/wsum_hess/test_left_matmul.h
@@ -10,7 +10,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_left_matmul()
+const char *test_wsum_hess_left_matmul(void)
 {
     /* Test weighted sum of Hessian of A @ log(x) where:
      * x is 3x1 variable at x = [1, 2, 3]
@@ -89,7 +89,7 @@ const char *test_wsum_hess_left_matmul()
     return 0;
 }
 
-const char *test_wsum_hess_left_matmul_composite()
+const char *test_wsum_hess_left_matmul_composite(void)
 {
     /* Test weighted sum of Hessian of A @ log(B @ x) where:
      * x is 3x1 variable at x = [1, 2, 3]
@@ -192,7 +192,7 @@ const char *test_wsum_hess_left_matmul_composite()
     return 0;
 }
 
-const char *test_wsum_hess_left_matmul_matrix()
+const char *test_wsum_hess_left_matmul_matrix(void)
 {
     /* Test weighted sum of Hessian of A @ log(x) where:
      * x is 3x2 variable, vectorized column-wise: [1, 2, 3, 4, 5, 6]
diff --git a/tests/wsum_hess/test_matmul.h b/tests/wsum_hess/test_matmul.h
index a6b74df..796f3b2 100644
--- a/tests/wsum_hess/test_matmul.h
+++ b/tests/wsum_hess/test_matmul.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_matmul()
+const char *test_wsum_hess_matmul(void)
 {
     /* Test: Z = X @ Y where X is 2x3, Y is 3x4
      * var = (X, Y) where X starts at index 0 (size 6), Y starts at index 6 (size 12)
@@ -106,7 +106,7 @@ const char *test_wsum_hess_matmul()
     return 0;
 }
 
-const char *test_wsum_hess_matmul_yx()
+const char *test_wsum_hess_matmul_yx(void)
 {
     /* Test: Z = X @ Y where X is 2x3, Y is 3x4
      * var = (Y, X) where Y starts at index 0 (size 12), X starts at index 12 (size
diff --git a/tests/wsum_hess/test_multiply.h b/tests/wsum_hess/test_multiply.h
index ed95c67..7f427bd 100644
--- a/tests/wsum_hess/test_multiply.h
+++ b/tests/wsum_hess/test_multiply.h
@@ -6,7 +6,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_multiply_1()
+const char *test_wsum_hess_multiply_1(void)
 {
     // Total 12 variables: [?, ?, ?, x0, x1, x2, ?, ?, y0, y1, y2, ?]
     // x has var_id = 3, y has var_id = 8
@@ -37,7 +37,7 @@ const char *test_wsum_hess_multiply_1()
     return 0;
 }
 
-const char *test_wsum_hess_multiply_sparse_random()
+const char *test_wsum_hess_multiply_sparse_random(void)
 {
     /* Test with larger random sparse matrices
      * A: 5x10 CSR matrix
@@ -109,7 +109,7 @@ const char *test_wsum_hess_multiply_sparse_random()
     return 0;
 }
 
-const char *test_wsum_hess_multiply_linear_ops()
+const char *test_wsum_hess_multiply_linear_ops(void)
 {
     /* Test Hessian for mult(Ax, Bx) where A, B are 4x3 linear operators
      * A = [[1.0, 0.0, 2.0],
@@ -191,7 +191,7 @@ const char *test_wsum_hess_multiply_linear_ops()
     return 0;
 }
 
-const char *test_wsum_hess_multiply_2()
+const char *test_wsum_hess_multiply_2(void)
 {
     // Total 12 variables: [?, ?, ?, y0, y1, y2, ?, ?, x0, x1, x2, ?]
     // y has var_id = 3, x has var_id = 8
diff --git a/tests/wsum_hess/test_prod.h b/tests/wsum_hess/test_prod.h
index 33a41b0..915edae 100644
--- a/tests/wsum_hess/test_prod.h
+++ b/tests/wsum_hess/test_prod.h
@@ -8,7 +8,7 @@
 
 /* Common setup: x is 4x1 variable, global index 2, total 8 vars */
 
-const char *test_wsum_hess_prod_no_zero()
+const char *test_wsum_hess_prod_no_zero(void)
 {
     /* x = [1, 2, 3, 4], f = 24 */
     double u_vals[8] = {0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 0.0, 0.0};
@@ -35,7 +35,7 @@ const char *test_wsum_hess_prod_no_zero()
     return 0;
 }
 
-const char *test_wsum_hess_prod_one_zero()
+const char *test_wsum_hess_prod_one_zero(void)
 {
     /* x = [1, 0, 3, 4], zero at index 1, prod_nonzero = 12 */
     double u_vals[8] = {0.0, 0.0, 1.0, 0.0, 3.0, 4.0, 0.0, 0.0};
@@ -68,7 +68,7 @@ const char *test_wsum_hess_prod_one_zero()
     return 0;
 }
 
-const char *test_wsum_hess_prod_two_zeros()
+const char *test_wsum_hess_prod_two_zeros(void)
 {
     /* x = [1, 0, 0, 4], zeros at 1 and 2, prod over others = 4 */
     double u_vals[8] = {0.0, 0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0};
@@ -96,7 +96,7 @@ const char *test_wsum_hess_prod_two_zeros()
     return 0;
 }
 
-const char *test_wsum_hess_prod_many_zeros()
+const char *test_wsum_hess_prod_many_zeros(void)
 {
     /* x = [0, 0, 0, 4], three zeros => Hessian all zeros */
     double u_vals[8] = {0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0};
diff --git a/tests/wsum_hess/test_prod_axis_one.h b/tests/wsum_hess/test_prod_axis_one.h
index 18524e9..eb94e79 100644
--- a/tests/wsum_hess/test_prod_axis_one.h
+++ b/tests/wsum_hess/test_prod_axis_one.h
@@ -6,7 +6,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_prod_axis_one_no_zeros()
+const char *test_wsum_hess_prod_axis_one_no_zeros(void)
 {
     /* x is 2x3 variable, global index 1, total 8 vars
      * x = [1, 2, 3, 4, 5, 6] (column-major)
@@ -72,7 +72,7 @@ const char *test_wsum_hess_prod_axis_one_no_zeros()
     return 0;
 }
 
-const char *test_wsum_hess_prod_axis_one_one_zero()
+const char *test_wsum_hess_prod_axis_one_one_zero(void)
 {
     /* x is 3x3 variable, global index 1, total 10 vars
      * x = [1, 2, 3, 4, 0, 6, 7, 8, 9] (column-major)
@@ -171,7 +171,7 @@ const char *test_wsum_hess_prod_axis_one_one_zero()
     return 0;
 }
 
-const char *test_wsum_hess_prod_axis_one_mixed_zeros()
+const char *test_wsum_hess_prod_axis_one_mixed_zeros(void)
 {
     /* x is 5x3 variable, global index 1, total 16 vars
      * Rows (axis=1 products):
@@ -321,7 +321,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros()
     free_expr(p);
     return 0;
 }
-const char *test_wsum_hess_prod_axis_one_2x2()
+const char *test_wsum_hess_prod_axis_one_2x2(void)
 {
     /* x is 2x2 variable, global index 0, total 4 vars
      * x = [2, 1, 3, 2] (column-major)
diff --git a/tests/wsum_hess/test_prod_axis_zero.h b/tests/wsum_hess/test_prod_axis_zero.h
index e81f97d..70eace1 100644
--- a/tests/wsum_hess/test_prod_axis_zero.h
+++ b/tests/wsum_hess/test_prod_axis_zero.h
@@ -6,7 +6,7 @@
 #include "other.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_prod_axis_zero_no_zeros()
+const char *test_wsum_hess_prod_axis_zero_no_zeros(void)
 {
     /* x is 2x3 variable, global index 1, total 8 vars
      * x = [1, 2, 3, 4, 5, 6] (column-major)
@@ -70,7 +70,7 @@ const char *test_wsum_hess_prod_axis_zero_no_zeros()
     return 0;
 }
 
-const char *test_wsum_hess_prod_axis_zero_mixed_zeros()
+const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void)
 {
     /* x is 5x3 variable, global index 1, total 16 vars
      * x = [1, 1, 1, 1, 1, 2, 0, 3, 4, 5, 1, 0, 0, 2, 3] (column-major)
@@ -189,7 +189,7 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros()
     return 0;
 }
 
-const char *test_wsum_hess_prod_axis_zero_one_zero()
+const char *test_wsum_hess_prod_axis_zero_one_zero(void)
 {
     /* Test with a column that has exactly 1 zero
      * x is 2x2 variable, global index 1, total 5 vars
diff --git a/tests/wsum_hess/test_quad_form.h b/tests/wsum_hess/test_quad_form.h
index e02233c..7342679 100644
--- a/tests/wsum_hess/test_quad_form.h
+++ b/tests/wsum_hess/test_quad_form.h
@@ -5,7 +5,7 @@
 #include "test_helpers.h"
 #include <string.h>
 
-const char *test_wsum_hess_quad_form()
+const char *test_wsum_hess_quad_form(void)
 {
     // x has var_id = 3, dimension 4, total variables = 10
     double u_vals[10] = {0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0};
diff --git a/tests/wsum_hess/test_quad_over_lin.h b/tests/wsum_hess/test_quad_over_lin.h
index 94d5a6a..f893659 100644
--- a/tests/wsum_hess/test_quad_over_lin.h
+++ b/tests/wsum_hess/test_quad_over_lin.h
@@ -4,7 +4,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_quad_over_lin_xy()
+const char *test_wsum_hess_quad_over_lin_xy(void)
 {
     /* x^T x / y with x var_id=2 (3 vars), y var_id=7, total n_vars=9
      * x = [1, 2, 3], y = 4, w = 2
@@ -33,7 +33,7 @@ const char *test_wsum_hess_quad_over_lin_xy()
     return 0;
 }
 
-const char *test_wsum_hess_quad_over_lin_yx()
+const char *test_wsum_hess_quad_over_lin_yx(void)
 {
     /* x^T x / y with y var_id=2, x var_id=5 (3 vars), total n_vars=9
      * x = [1, 2, 3], y = 4, w = 2
diff --git a/tests/wsum_hess/test_rel_entr.h b/tests/wsum_hess/test_rel_entr.h
index 3516882..ec69385 100644
--- a/tests/wsum_hess/test_rel_entr.h
+++ b/tests/wsum_hess/test_rel_entr.h
@@ -6,7 +6,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_rel_entr_1()
+const char *test_wsum_hess_rel_entr_1(void)
 {
     // Total 10 variables: [?, x0, x1, x2, ?, ?, y0, y1, y2, ?]
     // x has var_id = 1, y has var_id = 6
@@ -37,7 +37,7 @@ const char *test_wsum_hess_rel_entr_1()
     return 0;
 }
 
-const char *test_wsum_hess_rel_entr_2()
+const char *test_wsum_hess_rel_entr_2(void)
 {
     // Total 10 variables: [?, y0, y1, y2, ?, ?, x0, x1, x2, ?]
     // y has var_id = 1, x has var_id = 6
@@ -68,7 +68,7 @@ const char *test_wsum_hess_rel_entr_2()
     return 0;
 }
 
-const char *test_wsum_hess_rel_entr_matrix()
+const char *test_wsum_hess_rel_entr_matrix(void)
 {
     // x, y are 3 x 2 matrices (vectorized columnwise)
     // x has var_id = 0, y has var_id = 6
diff --git a/tests/wsum_hess/test_rel_entr_scalar_vector.h b/tests/wsum_hess/test_rel_entr_scalar_vector.h
index 3e1db8a..78d2d22 100644
--- a/tests/wsum_hess/test_rel_entr_scalar_vector.h
+++ b/tests/wsum_hess/test_rel_entr_scalar_vector.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_rel_entr_scalar_vector()
+const char *test_wsum_hess_rel_entr_scalar_vector(void)
 {
     // x is scalar 1, y is 3x1 with values [2, 3, 4], w = [4, 5, 6]
     double u_vals[4] = {1.0, 2.0, 3.0, 4.0};
diff --git a/tests/wsum_hess/test_rel_entr_vector_scalar.h b/tests/wsum_hess/test_rel_entr_vector_scalar.h
index 83e6a53..3ce1553 100644
--- a/tests/wsum_hess/test_rel_entr_vector_scalar.h
+++ b/tests/wsum_hess/test_rel_entr_vector_scalar.h
@@ -5,7 +5,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_rel_entr_vector_scalar()
+const char *test_wsum_hess_rel_entr_vector_scalar(void)
 {
     // x is 3x1 with values [1, 2, 3], y is scalar 4, w = [1, 2, 3]
     double u_vals[4] = {1.0, 2.0, 3.0, 4.0};
diff --git a/tests/wsum_hess/test_right_matmul.h b/tests/wsum_hess/test_right_matmul.h
index ca109f8..660e823 100644
--- a/tests/wsum_hess/test_right_matmul.h
+++ b/tests/wsum_hess/test_right_matmul.h
@@ -10,7 +10,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_right_matmul()
+const char *test_wsum_hess_right_matmul(void)
 {
     /* Test weighted sum of Hessian of log(x) @ A where:
      * x is 2x2 variable at x = [[1, 2], [3, 4]] (vectorized column-wise: [1, 3, 2,
@@ -60,7 +60,7 @@ const char *test_wsum_hess_right_matmul()
     return 0;
 }
 
-const char *test_wsum_hess_right_matmul_vector()
+const char *test_wsum_hess_right_matmul_vector(void)
 {
     /* Test weighted sum of Hessian of log(x) @ A where:
      * x is 1x3 variable at x = [1, 2, 3]
diff --git a/tests/wsum_hess/test_sum.h b/tests/wsum_hess/test_sum.h
index e6aff23..468b07b 100644
--- a/tests/wsum_hess/test_sum.h
+++ b/tests/wsum_hess/test_sum.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_sum_log_linear()
+const char *test_wsum_hess_sum_log_linear(void)
 {
     double Ax[6] = {1, 1, 2, 3, 1, -1};
     int Ai[6] = {0, 1, 0, 1, 0, 1};
@@ -50,7 +50,7 @@ const char *test_wsum_hess_sum_log_linear()
     return 0;
 }
 
-const char *test_wsum_hess_sum_log_axis0()
+const char *test_wsum_hess_sum_log_axis0(void)
 {
     /* Test: wsum_hess of sum(log(x), axis=0) where x is 3x2
      * x = [[1, 4],
@@ -88,7 +88,7 @@ const char *test_wsum_hess_sum_log_axis0()
     return 0;
 }
 
-const char *test_wsum_hess_sum_log_axis1()
+const char *test_wsum_hess_sum_log_axis1(void)
 {
     /* Test: wsum_hess of sum(log(x), axis=1) where x is 3x2
      * x = [[1, 4],
diff --git a/tests/wsum_hess/test_trace.h b/tests/wsum_hess/test_trace.h
index 901a8c5..023fbf7 100644
--- a/tests/wsum_hess/test_trace.h
+++ b/tests/wsum_hess/test_trace.h
@@ -9,7 +9,7 @@
 #include "minunit.h"
 #include "test_helpers.h"
 
-const char *test_wsum_hess_trace_variable()
+const char *test_wsum_hess_trace_variable(void)
 {
     /* Test weighted sum of Hessian of trace(x) where x is 3x3 variable
      * x has global variable index 1
@@ -43,7 +43,7 @@ const char *test_wsum_hess_trace_variable()
     return 0;
 }
 
-const char *test_wsum_hess_trace_log_variable()
+const char *test_wsum_hess_trace_log_variable(void)
 {
     /* Test weighted sum of Hessian of trace(log(x)) where x is 3x3 variable
      * x has global variable index 1
@@ -79,7 +79,7 @@ const char *test_wsum_hess_trace_log_variable()
     return 0;
 }
 
-const char *test_wsum_hess_trace_composite()
+const char *test_wsum_hess_trace_composite(void)
 {
     /* Test weighted sum of Hessian of trace(log(x) + exp(x)) where x is 3x3 variable
      * x has global variable index 1
diff --git a/tests/wsum_hess/test_transpose.h b/tests/wsum_hess/test_transpose.h
index 4e113f6..2ff2fc4 100644
--- a/tests/wsum_hess/test_transpose.h
+++ b/tests/wsum_hess/test_transpose.h
@@ -7,7 +7,7 @@
 #include <math.h>
 #include <stdio.h>
 
-const char *test_wsum_hess_transpose()
+const char *test_wsum_hess_transpose(void)
 {
 
     expr *X = new_variable(2, 2, 0, 8);