diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ce9857..e9c4d9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON)
 # =============================================================================
 # C tests (only for standalone builds)
 # =============================================================================
+option(PROFILE_ONLY "Build only profiling tests" OFF)
+
 if(NOT SKBUILD)
     include_directories(${PROJECT_SOURCE_DIR}/tests)
     enable_testing()
@@ -90,5 +92,11 @@ if(NOT SKBUILD)
         tests/test_helpers.c
     )
     target_link_libraries(all_tests dnlp_diff)
+    
+    if(PROFILE_ONLY)
+        target_compile_definitions(all_tests PRIVATE PROFILE_ONLY)
+        message(STATUS "Building ONLY profiling tests")
+    endif()
+    
     add_test(NAME AllTests COMMAND all_tests)
 endif()
diff --git a/include/subexpr.h b/include/subexpr.h
index a663282..b3f4170 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -109,7 +109,10 @@ typedef struct left_matmul_expr
     expr base;
     CSR_Matrix *A;
     CSR_Matrix *AT;
-    CSC_Matrix *CSC_work;
+    int n_blocks;
+    CSC_Matrix *Jchild_CSC;
+    CSC_Matrix *J_CSC;
+    int *csc_to_csr_workspace;
 } left_matmul_expr;
 
 /* Right matrix multiplication: y = f(x) * A where f(x) is an expression.
diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_Matrix.h
index ed044dc..c444dcc 100644
--- a/include/utils/CSC_Matrix.h
+++ b/include/utils/CSC_Matrix.h
@@ -52,15 +52,7 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C)
 CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork);
 void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork);
 
-/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC
- * Precomputes sparsity pattern. No workspace required.
- */
-CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B);
-
-/* Fill values of C = A @ B where A is CSR, B is CSC
- * C must have sparsity pattern already computed
- */
-void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
-                                CSR_Matrix *C);
+CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork);
+void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork);
 
 #endif /* CSC_MATRIX_H */
diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_Matrix.h
index 2e98abf..8c3f523 100644
--- a/include/utils/CSR_Matrix.h
+++ b/include/utils/CSR_Matrix.h
@@ -27,6 +27,7 @@ typedef struct CSR_Matrix
 
 /* Allocate a new CSR matrix with given dimensions and nnz */
 CSR_Matrix *new_csr_matrix(int m, int n, int nnz);
+CSR_Matrix *new_csr(const CSR_Matrix *A);
 
 /* Free a CSR matrix */
 void free_csr_matrix(CSR_Matrix *matrix);
diff --git a/include/utils/linalg.h b/include/utils/linalg.h
new file mode 100644
index 0000000..25890a9
--- /dev/null
+++ b/include/utils/linalg.h
@@ -0,0 +1,46 @@
+#ifndef LINALG_H
+#define LINALG_H
+
+/* Forward declarations */
+struct CSR_Matrix;
+struct CSC_Matrix;
+
+/* Compute sparsity pattern and values for the matrix-matrix multiplication
+   C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k,
+   without relying on generic sparse matrix-matrix multiplication. Specialized
+   logic for this is much faster (50-100x) than generic sparse matmul.
+
+    * J is provided in CSC format and is split into p blocks of n rows each
+    * C is returned in CSC format
+    * Mathematically it corresponds to  C = [A @ J1; A @ J2; ...; A @ Jp],
+      where J = [J1; J2; ...; Jp]
+*/
+struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A,
+                                                     const struct CSC_Matrix *J,
+                                                     int p);
+
+void block_left_multiply_fill_values(const struct CSR_Matrix *A,
+                                     const struct CSC_Matrix *J,
+                                     struct CSC_Matrix *C);
+
+/* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector.
+   The output y is m*p-length vector corresponding to
+   y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n
+   elements.
+*/
+void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y,
+                             int p);
+
+/* Fill values of C = A @ B where A is CSR, B is CSC.
+ * C must have sparsity pattern already computed.
+ */
+void csr_csc_matmul_fill_values(const struct CSR_Matrix *A,
+                                const struct CSC_Matrix *B, struct CSR_Matrix *C);
+
+/* C = A @ B where A is CSR, B is CSC. Result C is CSR.
+ * Allocates and precomputes sparsity pattern. No workspace required.
+ */
+struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A,
+                                        const struct CSC_Matrix *B);
+
+#endif /* LINALG_H */
diff --git a/src/affine/broadcast.c b/src/affine/broadcast.c
index 870e353..c16fe10 100644
--- a/src/affine/broadcast.c
+++ b/src/affine/broadcast.c
@@ -89,7 +89,7 @@ static void jacobian_init(expr *node)
     else
     {
         /* Scalar broadcast: (1, 1) -> (m, n) */
-        total_nnz = x->jacobian->nnz * node->d1 * node->d2;
+        total_nnz = x->jacobian->nnz * node->size;
     }
 
     node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz);
@@ -99,10 +99,10 @@ static void jacobian_init(expr *node)
     // ---------------------------------------------------------------------
     CSR_Matrix *Jx = x->jacobian;
     CSR_Matrix *J = node->jacobian;
-    J->nnz = 0;
 
     if (bcast->type == BROADCAST_ROW)
     {
+        J->nnz = 0;
         for (int i = 0; i < node->d2; i++)
         {
             int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
@@ -117,11 +117,11 @@ static void jacobian_init(expr *node)
                 J->nnz += nnz_in_row;
             }
         }
+        assert(J->nnz == total_nnz);
         J->p[node->size] = total_nnz;
     }
     else if (bcast->type == BROADCAST_COL)
     {
-
         /* copy column indices */
         tile_int(J->i, Jx->i, Jx->nnz, node->d2);
 
@@ -129,10 +129,11 @@ static void jacobian_init(expr *node)
         int offset = 0;
         for (int i = 0; i < node->d2; i++)
         {
+            int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
             for (int j = 0; j < node->d1; j++)
             {
                 J->p[i * node->d1 + j] = offset;
-                offset += Jx->p[1] - Jx->p[0];
+                offset += nnz_in_row;
             }
         }
         assert(offset == total_nnz);
@@ -141,12 +142,12 @@ static void jacobian_init(expr *node)
     else
     {
         /* copy column indices */
-        tile_int(J->i, Jx->i, Jx->nnz, node->d1 * node->d2);
+        tile_int(J->i, Jx->i, Jx->nnz, node->size);
 
         /* set row pointers */
         int offset = 0;
         int nnz = Jx->p[1] - Jx->p[0];
-        for (int i = 0; i < node->d1 * node->d2; i++)
+        for (int i = 0; i < node->size; i++)
         {
             J->p[i] = offset;
             offset += nnz;
@@ -163,10 +164,10 @@ static void eval_jacobian(expr *node)
     broadcast_expr *bcast = (broadcast_expr *) node;
     CSR_Matrix *Jx = node->left->jacobian;
     CSR_Matrix *J = node->jacobian;
-    J->nnz = 0;
 
     if (bcast->type == BROADCAST_ROW)
     {
+        J->nnz = 0;
         for (int i = 0; i < node->d2; i++)
         {
             int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
@@ -180,7 +181,7 @@ static void eval_jacobian(expr *node)
     }
     else
     {
-        tile_double(J->x, Jx->x, Jx->nnz, node->d1 * node->d2);
+        tile_double(J->x, Jx->x, Jx->nnz, node->size);
     }
 }
 
@@ -268,9 +269,9 @@ expr *new_broadcast(expr *child, int d1, int d2)
     }
     else
     {
-        fprintf(
-            stderr,
-            "ERROR: inconsistency of broadcasting between DNLP-diff and CVXPY. \n");
+        fprintf(stderr,
+                "ERROR: inconsistency of broadcasting between SparseDifferentiation"
+                " and CVXPY. \n");
         exit(1);
     }
 
diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c
index 92dd762..7b1baee 100644
--- a/src/bivariate/left_matmul.c
+++ b/src/bivariate/left_matmul.c
@@ -17,6 +17,8 @@
  */
 #include "bivariate.h"
 #include "subexpr.h"
+#include "utils/Timer.h"
+#include "utils/linalg.h"
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -31,7 +33,9 @@
    * To compute the forward pass: vec(y) = A_kron @ vec(f(x)),
      where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p),
      or more specificely, a block-diagonal matrix with p blocks of A along the
-     diagonal.
+     diagonal. In the refactored implementation we don't form A_kron explicitly,
+     only conceptually. This led to a 100x speedup in the initialization of the
+     Jacobian sparsity pattern.
 
    * To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the
      Jacobian of f(x) of size (n*p) x n_vars.
@@ -42,7 +46,8 @@
     Working in terms of A_kron unifies the implementation of f(x) being
     vector-valued or matrix-valued.
 
-
+    I (dance858) think we can get additional big speedups when A is dense by
+    introducing a dense matrix class.
 */
 
 #include "utils/utils.h"
@@ -55,7 +60,9 @@ static void forward(expr *node, const double *u)
     node->left->forward(node->left, u);
 
     /* y = A_kron @ vec(f(x)) */
-    csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value);
+    CSR_Matrix *A = ((left_matmul_expr *) node)->A;
+    int n_blocks = ((left_matmul_expr *) node)->n_blocks;
+    block_left_multiply_vec(A, x->value, node->value, n_blocks);
 }
 
 static bool is_affine(const expr *node)
@@ -68,13 +75,14 @@ static void free_type_data(expr *node)
     left_matmul_expr *lin_node = (left_matmul_expr *) node;
     free_csr_matrix(lin_node->A);
     free_csr_matrix(lin_node->AT);
-    if (lin_node->CSC_work)
-    {
-        free_csc_matrix(lin_node->CSC_work);
-    }
+    free_csc_matrix(lin_node->Jchild_CSC);
+    free_csc_matrix(lin_node->J_CSC);
+    free(lin_node->csc_to_csr_workspace);
     lin_node->A = NULL;
     lin_node->AT = NULL;
-    lin_node->CSC_work = NULL;
+    lin_node->Jchild_CSC = NULL;
+    lin_node->J_CSC = NULL;
+    lin_node->csc_to_csr_workspace = NULL;
 }
 
 static void jacobian_init(expr *node)
@@ -82,25 +90,32 @@ static void jacobian_init(expr *node)
     expr *x = node->left;
     left_matmul_expr *lin_node = (left_matmul_expr *) node;
 
-    /* initialize child's jacobian and precompute sparsity of its transpose */
+    /* initialize child's jacobian and precompute sparsity of its CSC */
     x->jacobian_init(x);
-    lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
+    lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
 
-    /* precompute sparsity of this node's jacobian */
-    node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work);
+    /* precompute sparsity of this node's jacobian in CSC and CSR */
+    lin_node->J_CSC = block_left_multiply_fill_sparsity(
+        lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
+    node->jacobian =
+        csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
 }
 
 static void eval_jacobian(expr *node)
 {
     expr *x = node->left;
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
+    left_matmul_expr *lnode = (left_matmul_expr *) node;
+
+    CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC;
+    CSC_Matrix *J_CSC = lnode->J_CSC;
 
     /* evaluate child's jacobian and convert to CSC */
     x->eval_jacobian(x);
-    csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork);
+    csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);
 
-    /* compute this node's jacobian */
-    csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian);
+    /* compute this node's jacobian: */
+    block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
+    csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
 }
 
 static void wsum_hess_init(expr *node)
@@ -115,15 +130,17 @@ static void wsum_hess_init(expr *node)
     memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int));
 
     /* work for computing A^T w*/
-    int A_n = ((left_matmul_expr *) node)->A->n;
-    node->dwork = (double *) malloc(A_n * sizeof(double));
+    int n_blocks = ((left_matmul_expr *) node)->n_blocks;
+    int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
+    node->dwork = (double *) malloc(dim * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
 {
     /* compute A^T w*/
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
-    csr_matvec_wo_offset(lin_node->AT, w, node->dwork);
+    CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
+    int n_blocks = ((left_matmul_expr *) node)->n_blocks;
+    block_left_multiply_vec(AT, w, node->dwork, n_blocks);
 
     node->left->eval_wsum_hess(node->left, node->dwork);
     memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
@@ -132,10 +149,10 @@ static void eval_wsum_hess(expr *node, const double *w)
 
 expr *new_left_matmul(expr *u, const CSR_Matrix *A)
 {
-    /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do
-       A @ u where u is (n, ) which in C is actually (1, n). In that case the result
-       of A @ u is (m, ), which is (1, m) according to broadcasting rules. We
-       therefore check if this is the case. */
+    /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users
+       to do A @ u where u is (n, ) which in C is actually (1, n). In that case
+       the result of A @ u is (m, ), which is (1, m) according to broadcasting
+       rules. We therefore check if this is the case. */
     int d1, d2, n_blocks;
     if (u->d1 == A->n)
     {
@@ -164,12 +181,17 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
     node->left = u;
     expr_retain(u);
 
-    /* Initialize type-specific fields */
-    lin_node->A = block_diag_repeat_csr(A, n_blocks);
-    int alloc = MAX(lin_node->A->n, node->n_vars);
-    node->iwork = (int *) malloc(alloc * sizeof(int));
+    /* allocate workspace. iwork is used for transposing A (requiring size A->n)
+       and for converting J_child csr to csc (requring size node->n_vars).
+       csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size)
+     */
+    node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
+    lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
+    lin_node->n_blocks = n_blocks;
+
+    /* store A and AT */
+    lin_node->A = new_csr(A);
     lin_node->AT = transpose(lin_node->A, node->iwork);
-    lin_node->CSC_work = NULL;
 
     return node;
 }
diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c
index 98c08d0..81cb317 100644
--- a/src/bivariate/right_matmul.c
+++ b/src/bivariate/right_matmul.c
@@ -17,6 +17,7 @@
  */
 #include "bivariate.h"
 #include "subexpr.h"
+#include "utils/linalg.h"
 #include <stdlib.h>
 
 /* This file implements the atom 'right_matmul' corresponding to the operation y =
diff --git a/src/problem.c b/src/problem.c
index d9c14a9..aae28bc 100644
--- a/src/problem.c
+++ b/src/problem.c
@@ -246,7 +246,7 @@ static inline void print_end_message(const Diff_engine_stats *stats)
 {
     printf("\n"
            "============================================================\n"
-           "              DNLP Differentiation Engine v%s\n"
+           "                SparseDifferentiation v%s\n"
            "  (c) D. Cederberg and W. Zhang, Stanford University, 2026\n"
            "============================================================\n",
            DIFF_ENGINE_VERSION);
diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c
index 2943599..d017a62 100644
--- a/src/utils/CSC_Matrix.c
+++ b/src/utils/CSC_Matrix.c
@@ -17,6 +17,7 @@
  */
 #include "utils/CSC_Matrix.h"
 #include "utils/iVec.h"
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -139,35 +140,6 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz,
     return sum;
 }
 
-/* Unweighted sparse dot product of two sorted index arrays */
-static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz,
-                                const double *b_x, const int *b_i, int b_nnz)
-{
-    int ii = 0;
-    int jj = 0;
-    double sum = 0.0;
-
-    while (ii < a_nnz && jj < b_nnz)
-    {
-        if (a_i[ii] == b_i[jj])
-        {
-            sum += a_x[ii] * b_x[jj];
-            ii++;
-            jj++;
-        }
-        else if (a_i[ii] < b_i[jj])
-        {
-            ii++;
-        }
-        else
-        {
-            jj++;
-        }
-    }
-
-    return sum;
-}
-
 void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C)
 {
     int j, ii, jj;
@@ -306,6 +278,72 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork)
     }
 }
 
+CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork)
+{
+    CSR_Matrix *C = new_csr_matrix(A->m, A->n, A->nnz);
+
+    int i, j;
+    int *count = iwork;
+    memset(count, 0, A->m * sizeof(int));
+
+    // -------------------------------------------------------------------
+    //              compute nnz in each row of A, store in count
+    // -------------------------------------------------------------------
+    for (i = 0; i < A->n; ++i)
+    {
+        for (j = A->p[i]; j < A->p[i + 1]; ++j)
+        {
+            assert(A->i[j] < A->m);
+            count[A->i[j]]++;
+        }
+    }
+
+    // ------------------------------------------------------------------
+    //                      compute row pointers
+    // ------------------------------------------------------------------
+    C->p[0] = 0;
+    for (i = 0; i < A->m; ++i)
+    {
+        C->p[i + 1] = C->p[i] + count[i];
+        count[i] = C->p[i];
+    }
+
+    // ------------------------------------------------------------------
+    //                         fill column indices
+    // ------------------------------------------------------------------
+    for (i = 0; i < A->n; ++i)
+    {
+        for (j = A->p[i]; j < A->p[i + 1]; ++j)
+        {
+            assert(A->i[j] < A->m);
+            C->i[count[A->i[j]]] = i;
+            count[A->i[j]]++;
+        }
+    }
+
+    return C;
+}
+
+void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork)
+{
+    int i, j;
+    int *count = iwork;
+    memcpy(count, C->p, A->m * sizeof(int));
+
+    // ------------------------------------------------------------------
+    //                         fill values
+    // ------------------------------------------------------------------
+    for (i = 0; i < A->n; ++i)
+    {
+        for (j = A->p[i]; j < A->p[i + 1]; ++j)
+        {
+            assert(A->i[j] < A->m);
+            C->x[count[A->i[j]]] = A->x[j];
+            count[A->i[j]]++;
+        }
+    }
+}
+
 CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B)
 {
     /* A is m x n, B is m x p, C = B^T A is p x n */
@@ -413,88 +451,3 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d,
         }
     }
 }
-
-static bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len)
-{
-    int ai = 0, bi = 0;
-    while (ai < a_len && bi < b_len)
-    {
-        if (a_idx[ai] == b_idx[bi]) return true;
-        if (a_idx[ai] < b_idx[bi])
-        {
-            ai++;
-        }
-        else
-        {
-            bi++;
-        }
-    }
-    return false;
-}
-
-/* Fill values of C = A @ B where A is CSR, B is CSC. */
-void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
-                                CSR_Matrix *C)
-{
-    for (int i = 0; i < A->m; i++)
-    {
-        for (int jj = C->p[i]; jj < C->p[i + 1]; jj++)
-        {
-            int j = C->i[jj];
-
-            int a_nnz = A->p[i + 1] - A->p[i];
-            int b_nnz = B->p[j + 1] - B->p[j];
-
-            /* Compute dot product of row i of A and column j of B */
-            double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz,
-                                    B->x + B->p[j], B->i + B->p[j], b_nnz);
-
-            C->x[jj] = sum;
-        }
-    }
-}
-
-/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p)
-  with precomputed sparsity pattern */
-CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B)
-{
-    int m = A->m;
-    int p = B->n;
-
-    int len_a, len_b;
-
-    int *Cp = (int *) malloc((m + 1) * sizeof(int));
-    iVec *Ci = iVec_new(m);
-
-    Cp[0] = 0;
-
-    // --------------------------------------------------------------
-    //            countz nnz and fill column indices
-    // --------------------------------------------------------------
-    int nnz = 0;
-    for (int i = 0; i < A->m; i++)
-    {
-        len_a = A->p[i + 1] - A->p[i];
-
-        for (int j = 0; j < B->n; j++)
-        {
-            len_b = B->p[j + 1] - B->p[j];
-
-            if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b))
-            {
-                iVec_append(Ci, j);
-                nnz++;
-            }
-        }
-
-        Cp[i + 1] = nnz;
-    }
-
-    CSR_Matrix *C = new_csr_matrix(m, p, nnz);
-    memcpy(C->p, Cp, (m + 1) * sizeof(int));
-    memcpy(C->i, Ci->data, nnz * sizeof(int));
-    free(Cp);
-    iVec_free(Ci);
-
-    return C;
-}
diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_Matrix.c
index 8b0635b..fe6688a 100644
--- a/src/utils/CSR_Matrix.c
+++ b/src/utils/CSR_Matrix.c
@@ -36,6 +36,15 @@ CSR_Matrix *new_csr_matrix(int m, int n, int nnz)
     return matrix;
 }
 
+CSR_Matrix *new_csr(const CSR_Matrix *A)
+{
+    CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz);
+    memcpy(copy->p, A->p, (A->m + 1) * sizeof(int));
+    memcpy(copy->i, A->i, A->nnz * sizeof(int));
+    memcpy(copy->x, A->x, A->nnz * sizeof(double));
+    return copy;
+}
+
 void free_csr_matrix(CSR_Matrix *matrix)
 {
     if (matrix)
diff --git a/src/utils/linalg.c b/src/utils/linalg.c
new file mode 100644
index 0000000..82c03e0
--- /dev/null
+++ b/src/utils/linalg.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the DNLP-differentiation-engine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "utils/CSC_Matrix.h"
+#include "utils/CSR_Matrix.h"
+#include "utils/iVec.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx,
+                               int b_len, int b_offset)
+{
+    int ai = 0, bi = 0;
+    while (ai < a_len && bi < b_len)
+    {
+        if (a_idx[ai] == b_idx[bi] - b_offset) return true;
+        if (a_idx[ai] < b_idx[bi] - b_offset)
+        {
+            ai++;
+        }
+        else
+        {
+            bi++;
+        }
+    }
+    return false;
+}
+
+/* Unweighted sparse dot product of two sorted index arrays */
+static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz,
+                                const double *b_x, const int *b_i, int b_nnz,
+                                int b_offset)
+{
+    int ii = 0;
+    int jj = 0;
+    double sum = 0.0;
+
+    while (ii < a_nnz && jj < b_nnz)
+    {
+        if (a_i[ii] == b_i[jj] - b_offset)
+        {
+            sum += a_x[ii] * b_x[jj];
+            ii++;
+            jj++;
+        }
+        else if (a_i[ii] < b_i[jj] - b_offset)
+        {
+            ii++;
+        }
+        else
+        {
+            jj++;
+        }
+    }
+
+    return sum;
+}
+
+static inline double sparse_dot_offset(const double *a_x, const int *a_idx,
+                                       int a_nnz, const double *b_x,
+                                       const int *b_idx, int b_nnz, int b_offset)
+{
+    int ii = 0, jj = 0;
+    double sum = 0.0;
+
+    while (ii < a_nnz && jj < b_nnz)
+    {
+        int b_col = b_idx[jj] - b_offset;
+
+        if (a_idx[ii] == b_col)
+        {
+            sum += a_x[ii] * b_x[jj];
+            ii++;
+            jj++;
+        }
+        else if (a_idx[ii] < b_col)
+        {
+            ii++;
+        }
+        else
+        {
+            jj++;
+        }
+    }
+
+    return sum;
+}
+
+CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A,
+                                              const CSC_Matrix *J, int p)
+{
+    /* A is m x n, J is (n*p) x k, C is (m*p) x k */
+    int m = A->m;
+    int n = A->n;
+    int k = J->n;
+    int j, jj, block, block_start, block_end, block_jj_start, block_jj_end,
+        row_offset;
+
+    /* allocate column pointers and an estimate of row indices */
+    int *Cp = (int *) malloc((k + 1) * sizeof(int));
+    iVec *Ci = iVec_new(k * m);
+    Cp[0] = 0;
+
+    /* for each column of j */
+    for (j = 0; j < J->n; j++)
+    {
+        /* if empty we continue */
+        if (J->p[j] == J->p[j + 1])
+        {
+            Cp[j + 1] = Cp[j];
+            continue;
+        }
+
+        /* process each of p blocks of rows in this column of J */
+        jj = J->p[j];
+        for (block = 0; block < p; block++)
+        {
+
+            // -----------------------------------------------------------------
+            //  find start and end indices of rows of J in this block
+            // -----------------------------------------------------------------
+            block_start = block * n;
+            block_end = block_start + n;
+            while (jj < J->p[j + 1] && J->i[jj] < block_start)
+            {
+                jj++;
+            }
+
+            block_jj_start = jj;
+
+            while (jj < J->p[j + 1] && J->i[jj] < block_end)
+            {
+                jj++;
+            }
+
+            block_jj_end = jj;
+            int nnz_in_block = block_jj_end - block_jj_start;
+            if (nnz_in_block == 0)
+            {
+                continue;
+            }
+
+            // -----------------------------------------------------------------
+            // check which rows of A overlap with the column indices of J in this
+            // block
+            // -----------------------------------------------------------------
+            row_offset = block * m;
+            for (int i = 0; i < A->m; i++)
+            {
+                int a_len = A->p[i + 1] - A->p[i];
+                if (has_overlap(A->i + A->p[i], a_len, J->i + block_jj_start,
+                                nnz_in_block, block_start))
+                {
+                    iVec_append(Ci, row_offset + i);
+                }
+            }
+        }
+        Cp[j + 1] = Ci->len;
+    }
+
+    /* Allocate result matrix C in CSC format */
+    CSC_Matrix *C = new_csc_matrix(m * p, k, Ci->len);
+
+    /* Copy column pointers and row indices */
+    memcpy(C->p, Cp, (k + 1) * sizeof(int));
+    memcpy(C->i, Ci->data, Ci->len * sizeof(int));
+
+    /* Clean up workspace */
+    free(Cp);
+    iVec_free(Ci);
+
+    return C;
+}
+
+void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J,
+                                     CSC_Matrix *C)
+{
+    /* A is m x n, J is (n*p) x k, C is (m*p) x k */
+    int m = A->m;
+    int n = A->n;
+    int k = J->n;
+
+    int i, j, row_a, block, block_start, block_end, start, end;
+
+    /* to get rid of unitialized warnings */
+    block = 0;
+    block_start = 0;
+    block_end = 0;
+    start = 0;
+    end = 0;
+
+    /* for each column of J (and C) */
+    for (j = 0; j < k; j++)
+    {
+        int previous_block = -1;
+
+        for (i = C->p[j]; i < C->p[j + 1]; i++)
+        {
+            /* choose row of A and block of column of J */
+            row_a = C->i[i] % m;
+            block = C->i[i] / m;
+
+            // -------------------------------------------------------------------------
+            //          find the part of the column of J in the current block
+            // -------------------------------------------------------------------------
+            if (block != previous_block)
+            {
+                previous_block = block;
+                block_start = block * n;
+                block_end = block_start + n;
+                start = J->p[j];
+                end = J->p[j + 1];
+
+                while (start < J->p[j + 1] && J->i[start] < block_start)
+                {
+                    start++;
+                }
+
+                while (end > start && J->i[end - 1] >= block_end)
+                {
+                    end--;
+                }
+            }
+
+            // ------------------------------------------------------------------------------
+            // compute value as sparse dot product of row of A and column of J in
+            // this block
+            // ------------------------------------------------------------------------------
+            int a_len = A->p[row_a + 1] - A->p[row_a];
+            C->x[i] =
+                sparse_dot(A->x + A->p[row_a], A->i + A->p[row_a], a_len,
+                           J->x + start, J->i + start, end - start, block_start);
+        }
+    }
+}
+
+/* Fill values of C = A @ B where A is CSR, B is CSC. */
+void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
+                                CSR_Matrix *C)
+{
+    for (int i = 0; i < A->m; i++)
+    {
+        for (int jj = C->p[i]; jj < C->p[i + 1]; jj++)
+        {
+            int j = C->i[jj];
+
+            int a_nnz = A->p[i + 1] - A->p[i];
+            int b_nnz = B->p[j + 1] - B->p[j];
+
+            /* Compute dot product of row i of A and column j of B */
+            double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz,
+                                    B->x + B->p[j], B->i + B->p[j], b_nnz, 0);
+
+            C->x[jj] = sum;
+        }
+    }
+}
+
+/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p)
+  with precomputed sparsity pattern */
+CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B)
+{
+    int m = A->m;
+    int p = B->n;
+
+    int len_a, len_b;
+
+    int *Cp = (int *) malloc((m + 1) * sizeof(int));
+    iVec *Ci = iVec_new(m);
+
+    Cp[0] = 0;
+
+    // --------------------------------------------------------------
+    //            count nnz and fill column indices
+    // --------------------------------------------------------------
+    int nnz = 0;
+    for (int i = 0; i < A->m; i++)
+    {
+        len_a = A->p[i + 1] - A->p[i];
+
+        for (int j = 0; j < B->n; j++)
+        {
+            len_b = B->p[j + 1] - B->p[j];
+
+            if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b, 0))
+            {
+                iVec_append(Ci, j);
+                nnz++;
+            }
+        }
+
+        Cp[i + 1] = nnz;
+    }
+
+    CSR_Matrix *C = new_csr_matrix(m, p, nnz);
+    memcpy(C->p, Cp, (m + 1) * sizeof(int));
+    memcpy(C->i, Ci->data, nnz * sizeof(int));
+    free(Cp);
+    iVec_free(Ci);
+
+    return C;
+}
+
+/* Compute block-wise matrix-vector products.
+ * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector.
+ * x is split into p blocks of n elements each.
+ */
+void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y,
+                             int p)
+{
+    /* For each block */
+    for (int block = 0; block < p; block++)
+    {
+        int block_start = block * A->n;
+        int y_offset = block * A->m;
+
+        /* For each row of A */
+        for (int i = 0; i < A->m; i++)
+        {
+            double row_sum = 0.0;
+            int row_nnz = A->p[i + 1] - A->p[i];
+
+            /* Compute sparse dot product of A[i,:] with x_block */
+            for (int idx = 0; idx < row_nnz; idx++)
+            {
+                int col = A->i[A->p[i] + idx];
+                row_sum += A->x[A->p[i] + idx] * x[block_start + col];
+            }
+
+            y[y_offset + i] = row_sum;
+        }
+    }
+}
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 4e7571a..3b23e45 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -3,6 +3,7 @@
 #include "minunit.h"
 
 /* Include all test headers */
+#ifndef PROFILE_ONLY
 #include "forward_pass/affine/test_add.h"
 #include "forward_pass/affine/test_broadcast.h"
 #include "forward_pass/affine/test_hstack.h"
@@ -43,7 +44,9 @@
 #include "jacobian_tests/test_transpose.h"
 #include "problem/test_problem.h"
 #include "utils/test_csc_matrix.h"
+#include "utils/test_csr_csc_conversion.h"
 #include "utils/test_csr_matrix.h"
+#include "utils/test_linalg.h"
 #include "wsum_hess/elementwise/test_entr.h"
 #include "wsum_hess/elementwise/test_exp.h"
 #include "wsum_hess/elementwise/test_hyperbolic.h"
@@ -72,6 +75,11 @@
 #include "wsum_hess/test_sum.h"
 #include "wsum_hess/test_trace.h"
 #include "wsum_hess/test_transpose.h"
+#endif /* PROFILE_ONLY */
+
+#ifdef PROFILE_ONLY
+#include "profiling/profile_left_matmul.h"
+#endif /* PROFILE_ONLY */
 
 int main(void)
 {
@@ -79,6 +87,7 @@ int main(void)
 
     int tests_run = 0;
 
+#ifndef PROFILE_ONLY
     printf("--- Forward Pass Tests ---\n");
     mu_run_test(test_variable, tests_run);
     mu_run_test(test_constant, tests_run);
@@ -153,6 +162,7 @@ int main(void)
     mu_run_test(test_broadcast_row_jacobian, tests_run);
     mu_run_test(test_broadcast_col_jacobian, tests_run);
     mu_run_test(test_broadcast_scalar_to_matrix_jacobian, tests_run);
+    mu_run_test(test_double_broadcast, tests_run);
     mu_run_test(test_wsum_hess_multiply_1, tests_run);
     mu_run_test(test_wsum_hess_multiply_2, tests_run);
     mu_run_test(test_jacobian_trace_variable, tests_run);
@@ -239,6 +249,19 @@ int main(void)
     mu_run_test(test_kron_identity_csr, tests_run);
     mu_run_test(test_csr_to_csc1, tests_run);
     mu_run_test(test_csr_to_csc2, tests_run);
+    mu_run_test(test_csr_to_csc_split, tests_run);
+    mu_run_test(test_csc_to_csr_sparsity, tests_run);
+    mu_run_test(test_csc_to_csr_values, tests_run);
+    mu_run_test(test_csr_csc_csr_roundtrip, tests_run);
+    mu_run_test(test_block_left_multiply_single_block, tests_run);
+    mu_run_test(test_block_left_multiply_two_blocks, tests_run);
+    mu_run_test(test_block_left_multiply_zero_column, tests_run);
+    mu_run_test(test_csr_csc_matmul_alloc_basic, tests_run);
+    mu_run_test(test_csr_csc_matmul_alloc_sparse, tests_run);
+    mu_run_test(test_block_left_multiply_vec_single_block, tests_run);
+    mu_run_test(test_block_left_multiply_vec_two_blocks, tests_run);
+    mu_run_test(test_block_left_multiply_vec_sparse, tests_run);
+    mu_run_test(test_block_left_multiply_vec_three_blocks, tests_run);
     mu_run_test(test_csr_vecmat_values_sparse, tests_run);
     mu_run_test(test_sum_all_rows_csr, tests_run);
     mu_run_test(test_sum_block_of_rows_csr, tests_run);
@@ -257,6 +280,12 @@ int main(void)
     mu_run_test(test_problem_jacobian_multi, tests_run);
     mu_run_test(test_problem_constraint_forward, tests_run);
     mu_run_test(test_problem_hessian, tests_run);
+#endif /* PROFILE_ONLY */
+
+#ifdef PROFILE_ONLY
+    printf("\n--- Profiling Tests ---\n");
+    mu_run_test(profile_left_matmul, tests_run);
+#endif /* PROFILE_ONLY */
 
     printf("\n=== All %d tests passed ===\n", tests_run);
 
diff --git a/tests/jacobian_tests/test_broadcast.h b/tests/jacobian_tests/test_broadcast.h
index 43872fe..612e5cf 100644
--- a/tests/jacobian_tests/test_broadcast.h
+++ b/tests/jacobian_tests/test_broadcast.h
@@ -133,3 +133,35 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void)
     free_expr(bcast);
     return 0;
 }
+
+const char *test_double_broadcast(void)
+{
+    double x_vals[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
+    double b_vals[5] = {10.0, 20.0, 30.0, 40.0, 50.0};
+
+    /* form the expression x + b */
+    expr *x = new_variable(5, 1, 0, 5);
+    expr *b = new_constant(1, 5, 5, b_vals);
+    expr *bcast_x = new_broadcast(x, 5, 5);
+    expr *bcast_b = new_broadcast(b, 5, 5);
+    expr *sum = new_add(bcast_x, bcast_b);
+
+    sum->forward(sum, x_vals);
+    sum->jacobian_init(sum);
+    sum->eval_jacobian(sum);
+
+    /* All 6 elements depend on the single input variable */
+    // double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    // int expected_p[7] = {0, 1, 2, 3, 4, 5, 6};
+    // int expected_i[6] = {0, 0, 0, 0, 0, 0};
+    //
+    // mu_assert("broadcast scalar jacobian vals fail",
+    //          cmp_double_array(sum->jacobian->x, expected_x, 6));
+    // mu_assert("broadcast scalar jacobian rows fail",
+    //          cmp_int_array(sum ->jacobian->p, expected_p, 7));
+    // mu_assert("broadcast scalar jacobian cols fail",
+    //          cmp_int_array(bcast->jacobian->i, expected_i, 6));
+
+    free_expr(sum);
+    return 0;
+}
diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h
new file mode 100644
index 0000000..a8e819b
--- /dev/null
+++ b/tests/profiling/profile_left_matmul.h
@@ -0,0 +1,62 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "affine.h"
+#include "bivariate.h"
+#include "elementwise_univariate.h"
+#include "expr.h"
+#include "minunit.h"
+#include "test_helpers.h"
+#include "utils/Timer.h"
+
+const char *profile_left_matmul()
+{
+    /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */
+    int n = 100;
+    expr *X = new_variable(n, n, 0, n * n);
+    CSR_Matrix *A = new_csr_matrix(n, n, n * n);
+    for (int i = 0; i < n * n; i++)
+    {
+        A->x[i] = 1.0; /* dense matrix of all ones */
+    }
+    for (int row = 0; row < n; row++)
+    {
+        A->p[row] = row * n;
+        for (int col = 0; col < n; col++)
+        {
+            A->i[row * n + col] = col;
+        }
+    }
+    A->p[n] = n * n;
+
+    expr *AX = new_left_matmul(X, A);
+
+    double *x_vals = (double *) malloc(n * n * sizeof(double));
+    for (int i = 0; i < n * n; i++)
+    {
+        x_vals[i] = 1.0;
+    }
+
+    Timer timer;
+    clock_gettime(CLOCK_MONOTONIC, &timer.start);
+    AX->forward(AX, x_vals);
+    clock_gettime(CLOCK_MONOTONIC, &timer.end);
+    printf("left_matmul forward time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer));
+    clock_gettime(CLOCK_MONOTONIC, &timer.start);
+    AX->jacobian_init(AX);
+    clock_gettime(CLOCK_MONOTONIC, &timer.end);
+    printf("left_matmul jacobian init time: %8.3f seconds\n",
+           GET_ELAPSED_SECONDS(timer));
+    clock_gettime(CLOCK_MONOTONIC, &timer.start);
+    AX->eval_jacobian(AX);
+    clock_gettime(CLOCK_MONOTONIC, &timer.end);
+    printf("left_matmul jacobian eval time: %8.3f seconds\n",
+           GET_ELAPSED_SECONDS(timer));
+
+    free(x_vals);
+    free_csr_matrix(A);
+    free_expr(AX);
+    return 0;
+}
diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h
new file mode 100644
index 0000000..c7daeb6
--- /dev/null
+++ b/tests/utils/test_csr_csc_conversion.h
@@ -0,0 +1,171 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "minunit.h"
+#include "test_helpers.h"
+#include "utils/CSC_Matrix.h"
+#include "utils/CSR_Matrix.h"
+
+/* Test CSR to CSC conversion with fill_sparsity and fill_values */
+const char *test_csr_to_csc_split()
+{
+    /* Create a 4x5 CSR matrix A:
+     * [1.0  0.0  0.0  0.0  1.0]
+     * [0.0  0.0  3.0  0.0  0.0]
+     * [0.0  2.0  0.0  0.0  0.0]
+     * [0.0  0.0  0.0  4.0  0.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(4, 5, 5);
+    double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0};
+    int Ai[5] = {0, 4, 2, 1, 3};
+    int Ap[5] = {0, 2, 3, 4, 5};
+    memcpy(A->x, Ax, 5 * sizeof(double));
+    memcpy(A->i, Ai, 5 * sizeof(int));
+    memcpy(A->p, Ap, 5 * sizeof(int));
+
+    /* Allocate workspace */
+    int *iwork = (int *) malloc(A->n * sizeof(int));
+
+    /* First, fill sparsity pattern */
+    CSC_Matrix *C = csr_to_csc_fill_sparsity(A, iwork);
+
+    /* Check sparsity pattern */
+    int Cp_correct[6] = {0, 1, 2, 3, 4, 5};
+    int Ci_correct[5] = {0, 2, 1, 3, 0};
+
+    mu_assert("C col pointers incorrect", cmp_int_array(C->p, Cp_correct, 6));
+    mu_assert("C row indices incorrect", cmp_int_array(C->i, Ci_correct, 5));
+
+    /* Now fill values */
+    csr_to_csc_fill_values(A, C, iwork);
+
+    /* Check values */
+    double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 1.0};
+
+    mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5));
+
+    free(iwork);
+    free_csr_matrix(A);
+    free_csc_matrix(C);
+
+    return 0;
+}
+
+/* Test CSC to CSR conversion with fill_sparsity */
+const char *test_csc_to_csr_sparsity()
+{
+    /* Create a 4x5 CSC matrix A:
+     * [1.0  0.0  0.0  0.0  2.0]
+     * [0.0  0.0  3.0  0.0  0.0]
+     * [0.0  4.0  0.0  0.0  0.0]
+     * [0.0  0.0  0.0  5.0  0.0]
+     */
+    CSC_Matrix *A = new_csc_matrix(4, 5, 5);
+    double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0};
+    int Ai[5] = {0, 2, 1, 3, 0};
+    int Ap[6] = {0, 1, 2, 3, 4, 5};
+    memcpy(A->x, Ax, 5 * sizeof(double));
+    memcpy(A->i, Ai, 5 * sizeof(int));
+    memcpy(A->p, Ap, 6 * sizeof(int));
+
+    /* Allocate workspace */
+    int *iwork = (int *) malloc(A->m * sizeof(int));
+
+    /* Fill sparsity pattern */
+    CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork);
+
+    /* Expected CSR format:
+     * Row 0: [1.0 at col 0, 2.0 at col 4]
+     * Row 1: [3.0 at col 2]
+     * Row 2: [4.0 at col 1]
+     * Row 3: [5.0 at col 3]
+     */
+    int Cp_correct[5] = {0, 2, 3, 4, 5};
+    int Ci_correct[5] = {0, 4, 2, 1, 3};
+
+    mu_assert("C row pointers incorrect", cmp_int_array(C->p, Cp_correct, 5));
+    mu_assert("C col indices incorrect", cmp_int_array(C->i, Ci_correct, 5));
+    mu_assert("C dimensions incorrect", C->m == 4 && C->n == 5);
+    mu_assert("C nnz incorrect", C->nnz == 5);
+
+    free(iwork);
+    free_csc_matrix(A);
+    free_csr_matrix(C);
+
+    return 0;
+}
+
+/* Test CSC to CSR conversion with fill_values */
+const char *test_csc_to_csr_values()
+{
+    /* Create a 4x5 CSC matrix A */
+    CSC_Matrix *A = new_csc_matrix(4, 5, 5);
+    double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0};
+    int Ai[5] = {0, 2, 1, 3, 0};
+    int Ap[6] = {0, 1, 2, 3, 4, 5};
+    memcpy(A->x, Ax, 5 * sizeof(double));
+    memcpy(A->i, Ai, 5 * sizeof(int));
+    memcpy(A->p, Ap, 6 * sizeof(int));
+
+    /* Allocate workspace */
+    int *iwork = (int *) malloc(A->m * sizeof(int));
+
+    /* Fill sparsity pattern */
+    CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork);
+
+    /* Fill values */
+    csc_to_csr_fill_values(A, C, iwork);
+
+    /* Check values */
+    double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
+
+    mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5));
+
+    free(iwork);
+    free_csc_matrix(A);
+    free_csr_matrix(C);
+
+    return 0;
+}
+
+/* Test round-trip conversion: CSR -> CSC -> CSR */
+const char *test_csr_csc_csr_roundtrip()
+{
+    /* Create a 3x4 CSR matrix A:
+     * [1.0  2.0  0.0  3.0]
+     * [0.0  4.0  5.0  0.0]
+     * [6.0  0.0  7.0  8.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(3, 4, 8);
+    double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+    int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3};
+    int Ap[4] = {0, 3, 5, 8};
+    memcpy(A->x, Ax, 8 * sizeof(double));
+    memcpy(A->i, Ai, 8 * sizeof(int));
+    memcpy(A->p, Ap, 4 * sizeof(int));
+
+    /* Convert CSR to CSC */
+    int *iwork_csc = (int *) malloc(A->n * sizeof(int));
+    CSC_Matrix *B = csr_to_csc_fill_sparsity(A, iwork_csc);
+    csr_to_csc_fill_values(A, B, iwork_csc);
+
+    /* Convert CSC back to CSR */
+    int *iwork_csr = (int *) malloc(B->m * sizeof(int));
+    CSR_Matrix *C = csc_to_csr_fill_sparsity(B, iwork_csr);
+    csc_to_csr_fill_values(B, C, iwork_csr);
+
+    /* C should match A */
+    mu_assert("Round-trip: vals incorrect", cmp_double_array(C->x, Ax, 8));
+    mu_assert("Round-trip: col indices incorrect", cmp_int_array(C->i, Ai, 8));
+    mu_assert("Round-trip: row pointers incorrect", cmp_int_array(C->p, Ap, 4));
+
+    free(iwork_csc);
+    free(iwork_csr);
+    free_csr_matrix(A);
+    free_csc_matrix(B);
+    free_csr_matrix(C);
+
+    return 0;
+}
diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h
new file mode 100644
index 0000000..e7a6896
--- /dev/null
+++ b/tests/utils/test_linalg.h
@@ -0,0 +1,398 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "minunit.h"
+#include "test_helpers.h"
+#include "utils/CSC_Matrix.h"
+#include "utils/CSR_Matrix.h"
+#include "utils/linalg.h"
+
+/* Test block_left_multiply_fill_sparsity with simple case: single block */
+const char *test_block_left_multiply_single_block()
+{
+    /* A is 2x3 CSR:
+     * [1.0  0.0  0.0]
+     * [0.0  1.0  1.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 3, 3);
+    double Ax[3] = {1.0, 1.0, 1.0};
+    int Ai[3] = {0, 1, 2};
+    int Ap[3] = {0, 1, 3};
+    memcpy(A->x, Ax, 3 * sizeof(double));
+    memcpy(A->i, Ai, 3 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* J is 3x2 CSC (single block, so p=1):
+     * [1.0  0.0]
+     * [1.0  0.0]
+     * [0.0  1.0]
+     */
+    CSC_Matrix *J = new_csc_matrix(3, 2, 3);
+    double Jx[3] = {1.0, 1.0, 1.0};
+    int Ji[3] = {0, 1, 2};
+    int Jp[3] = {0, 2, 3};
+    memcpy(J->x, Jx, 3 * sizeof(double));
+    memcpy(J->i, Ji, 3 * sizeof(int));
+    memcpy(J->p, Jp, 3 * sizeof(int));
+
+    /* Compute C = A @ J1 (p=1 means just one block) */
+    CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1);
+
+    /* Expected C is 2x2:
+     * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has
+     * row 0) C[1,0] = A[1,:] @ J[:,0] = 1.0*1.0 + 1.0*0.0 = 1.0 (row 1 has columns
+     * 1,2, J col 0 has row 1) C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J
+     * col 1 has row 2) C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns
+     * 1,2, J col 1 has row 2)
+     */
+    int expected_p[3] = {0, 2, 3};
+    int expected_i[3] = {0, 1, 1};
+
+    mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 3);
+    mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p, 3));
+    mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i, 3));
+
+    free_csc_matrix(C);
+    free_csr_matrix(A);
+    free_csc_matrix(J);
+    return NULL;
+}
+
+/* Test block_left_multiply_fill_sparsity with two blocks */
+const char *test_block_left_multiply_two_blocks()
+{
+    /* A is 2x2 CSR:
+     * [1.0  0.0]
+     * [0.0  1.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 2, 2);
+    double Ax[2] = {1.0, 1.0};
+    int Ai[2] = {0, 1};
+    int Ap[3] = {0, 1, 2};
+    memcpy(A->x, Ax, 2 * sizeof(double));
+    memcpy(A->i, Ai, 2 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* J is 4x3 CSC (two blocks of 2 rows each):
+     * Block 1 rows [0,1]:
+     * [1.0  0.0  0.0]
+     * [0.0  0.0  0.0]
+     * Block 2 rows [2,3]:
+     * [0.0  1.0  0.0]
+     * [0.0  0.0  1.0]
+     * So J is:
+     * [1.0  0.0  0.0]
+     * [0.0  0.0  0.0]
+     * [0.0  1.0  0.0]
+     * [0.0  0.0  1.0]
+     */
+    CSC_Matrix *J = new_csc_matrix(4, 3, 3);
+    double Jx[3] = {1.0, 1.0, 1.0};
+    int Ji[3] = {0, 2, 3};
+    int Jp[4] = {0, 1, 2, 3};
+    memcpy(J->x, Jx, 3 * sizeof(double));
+    memcpy(J->i, Ji, 3 * sizeof(int));
+    memcpy(J->p, Jp, 4 * sizeof(int));
+
+    /* Compute C = [A @ J1; A @ J2] where:
+     * J1 = [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
+     * J2 = [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
+     *
+     * C = [A @ J1; A @ J2] is 4x3:
+     * A @ J1 = [[1, 0, 0], [0, 0, 0]] (row 0 of A matches col 0 of J1)
+     * A @ J2 = [[0, 0, 0], [0, 1, 1]] (row 1 of A matches cols 1,2 of J2)
+     * So C is:
+     * [1.0  0.0  0.0]
+     * [0.0  0.0  0.0]
+     * [0.0  0.0  0.0]
+     * [0.0  1.0  1.0]
+     */
+    CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2);
+    block_left_multiply_fill_values(A, J, C);
+
+    int expected_p2[4] = {0, 1, 2, 3};
+    int expected_i2[3] = {0, 2, 3};
+    double expected_x2[3] = {1.0, 1.0, 1.0};
+
+    mu_assert("C dims incorrect", C->m == 4 && C->n == 3 && C->nnz == 3);
+    mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p2, 4));
+    mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i2, 3));
+    mu_assert("C values incorrect", cmp_double_array(C->x, expected_x2, 3));
+
+    free_csc_matrix(C);
+    free_csr_matrix(A);
+    free_csc_matrix(J);
+    return NULL;
+}
+
+/* Test block_left_multiply_fill_sparsity with all zero column in J */
+const char *test_block_left_multiply_zero_column()
+{
+    /* A is 2x2 CSR (identity) */
+    CSR_Matrix *A = new_csr_matrix(2, 2, 2);
+    double Ax[2] = {1.0, 1.0};
+    int Ai[2] = {0, 1};
+    int Ap[3] = {0, 1, 2};
+    memcpy(A->x, Ax, 2 * sizeof(double));
+    memcpy(A->i, Ai, 2 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* J is 2x2 with an empty column:
+     * [1.0  0.0]
+     * [0.0  0.0]
+     */
+    CSC_Matrix *J = new_csc_matrix(2, 2, 1);
+    double Jx[1] = {1.0};
+    int Ji[1] = {0};
+    int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */
+    memcpy(J->x, Jx, 1 * sizeof(double));
+    memcpy(J->i, Ji, 1 * sizeof(int));
+    memcpy(J->p, Jp, 3 * sizeof(int));
+
+    CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1);
+
+    int expected_p3[3] = {0, 1, 1};
+    int expected_i3[1] = {0};
+
+    mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 1);
+    mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p3, 3));
+    mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i3, 1));
+
+    free_csc_matrix(C);
+    free_csr_matrix(A);
+    free_csc_matrix(J);
+    return NULL;
+}
+
+/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */
+const char *test_csr_csc_matmul_alloc_basic()
+{
+    /* A is 3x2 CSR:
+     * [1.0  0.0]
+     * [0.0  1.0]
+     * [1.0  1.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(3, 2, 4);
+    double Ax[4] = {1.0, 1.0, 1.0, 1.0};
+    int Ai[4] = {0, 1, 0, 1};
+    int Ap[4] = {0, 1, 2, 4};
+    memcpy(A->x, Ax, 4 * sizeof(double));
+    memcpy(A->i, Ai, 4 * sizeof(int));
+    memcpy(A->p, Ap, 4 * sizeof(int));
+
+    /* B is 2x3 CSC:
+     * [1.0  0.0  1.0]
+     * [0.0  1.0  1.0]
+     */
+    CSC_Matrix *B = new_csc_matrix(2, 3, 4);
+    double Bx[4] = {1.0, 1.0, 1.0, 1.0};
+    int Bi[4] = {0, 1, 0, 1};
+    int Bp[4] = {0, 1, 2, 4};
+    memcpy(B->x, Bx, 4 * sizeof(double));
+    memcpy(B->i, Bi, 4 * sizeof(int));
+    memcpy(B->p, Bp, 4 * sizeof(int));
+
+    /* C = A @ B is 3x3:
+     * C = [[1, 0, 1],
+     *      [0, 1, 1],
+     *      [1, 1, 2]]
+     */
+    CSR_Matrix *C = csr_csc_matmul_alloc(A, B);
+
+    int expected_p4[4] = {0, 2, 4, 7};
+    int expected_i4[7] = {0, 2, 1, 2, 0, 1, 2};
+
+    mu_assert("C dims incorrect", C->m == 3 && C->n == 3 && C->nnz == 7);
+    mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p4, 4));
+    mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i4, 7));
+
+    free_csr_matrix(C);
+    free_csr_matrix(A);
+    free_csc_matrix(B);
+    return NULL;
+}
+
+/* Test csr_csc_matmul_alloc with sparse result */
+const char *test_csr_csc_matmul_alloc_sparse()
+{
+    /* A is 2x3 CSR:
+     * [1.0  0.0  0.0]
+     * [0.0  0.0  1.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 3, 2);
+    double Ax[2] = {1.0, 1.0};
+    int Ai[2] = {0, 2};
+    int Ap[3] = {0, 1, 2};
+    memcpy(A->x, Ax, 2 * sizeof(double));
+    memcpy(A->i, Ai, 2 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* B is 3x2 CSC:
+     * [1.0  0.0]
+     * [0.0  0.0]
+     * [0.0  1.0]
+     */
+    CSC_Matrix *B = new_csc_matrix(3, 2, 2);
+    double Bx[2] = {1.0, 1.0};
+    int Bi[2] = {0, 2};
+    int Bp[3] = {0, 1, 2};
+    memcpy(B->x, Bx, 2 * sizeof(double));
+    memcpy(B->i, Bi, 2 * sizeof(int));
+    memcpy(B->p, Bp, 3 * sizeof(int));
+
+    /* C = A @ B is 2x2:
+     * C = [[1, 0],
+     *      [0, 1]]
+     */
+    CSR_Matrix *C = csr_csc_matmul_alloc(A, B);
+
+    int expected_p5[3] = {0, 1, 2};
+    int expected_i5[2] = {0, 1};
+
+    mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 2);
+    mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p5, 3));
+    mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i5, 2));
+
+    free_csr_matrix(C);
+    free_csr_matrix(A);
+    free_csc_matrix(B);
+    return NULL;
+}
+
+/* Test block_left_multiply_vec with single block: y = A @ x */
+const char *test_block_left_multiply_vec_single_block()
+{
+    /* A is 2x3 CSR:
+     * [1.0  0.0  2.0]
+     * [0.0  3.0  0.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 3, 3);
+    double Ax[3] = {1.0, 3.0, 2.0};
+    int Ai[3] = {0, 1, 2};
+    int Ap[3] = {0, 2, 3};
+    memcpy(A->x, Ax, 3 * sizeof(double));
+    memcpy(A->i, Ai, 3 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* x is (3*1)-length vector = [1.0, 2.0, 3.0] */
+    double x[3] = {1.0, 2.0, 3.0};
+    double y[2] = {0.0, 0.0};
+
+    block_left_multiply_vec(A, x, y, 1);
+
+    /* Expected: y = [1.0*1.0 + 0.0*2.0 + 2.0*3.0, 0.0*1.0 + 3.0*2.0 + 0.0*3.0]
+     *             = [1.0 + 6.0, 6.0]
+     *             = [7.0, 6.0]
+     */
+    double expected_y[2] = {7.0, 6.0};
+    mu_assert("y values incorrect", cmp_double_array(y, expected_y, 2));
+
+    free_csr_matrix(A);
+    return NULL;
+}
+
+/* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */
+const char *test_block_left_multiply_vec_two_blocks()
+{
+    /* A is 2x3 CSR:
+     * [1.0  2.0  0.0]
+     * [0.0  3.0  4.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 3, 4);
+    double Ax[4] = {1.0, 2.0, 3.0, 4.0};
+    int Ai[4] = {0, 1, 1, 2};
+    int Ap[3] = {0, 2, 4};
+    memcpy(A->x, Ax, 4 * sizeof(double));
+    memcpy(A->i, Ai, 4 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* x is (3*2)-length vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+     * x1 = [1.0, 2.0, 3.0], x2 = [4.0, 5.0, 6.0]
+     */
+    double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    double y[4] = {0.0, 0.0, 0.0, 0.0};
+
+    block_left_multiply_vec(A, x, y, 2);
+
+    /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*2.0 + 4.0*3.0] =
+     * [5.0, 18.0] Expected block 2: y[2:4] = A @ x2 = [1.0*4.0 + 2.0*5.0, 3.0*5.0
+     * + 4.0*6.0] = [14.0, 39.0]
+     */
+    double expected_y[4] = {5.0, 18.0, 14.0, 39.0};
+    mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4));
+
+    free_csr_matrix(A);
+    return NULL;
+}
+
+/* Test block_left_multiply_vec with sparse matrix and multiple blocks */
+const char *test_block_left_multiply_vec_sparse()
+{
+    /* A is 3x4 CSR (very sparse):
+     * [2.0  0.0  0.0  0.0]
+     * [0.0  0.0  3.0  0.0]
+     * [0.0  0.0  0.0  4.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(3, 4, 3);
+    double Ax[3] = {2.0, 3.0, 4.0};
+    int Ai[3] = {0, 2, 3};
+    int Ap[4] = {0, 1, 2, 3};
+    memcpy(A->x, Ax, 3 * sizeof(double));
+    memcpy(A->i, Ai, 3 * sizeof(int));
+    memcpy(A->p, Ap, 4 * sizeof(int));
+
+    /* x is (4*2)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
+     * x1 = [1.0, 2.0, 3.0, 4.0], x2 = [5.0, 6.0, 7.0, 8.0]
+     */
+    double x[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+    double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+    block_left_multiply_vec(A, x, y, 2);
+
+    /* Expected block 1: y[0:3] = A @ x1 = [2.0*1.0, 3.0*3.0, 4.0*4.0] =
+     * [2.0, 9.0, 16.0] Expected block 2: y[3:6] = A @ x2 =
+     * [2.0*5.0, 3.0*7.0, 4.0*8.0] = [10.0, 21.0, 32.0]
+     */
+    double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0};
+    mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6));
+
+    free_csr_matrix(A);
+    return NULL;
+}
+
+/* Test block_left_multiply_vec with three blocks */
+const char *test_block_left_multiply_vec_three_blocks()
+{
+    /* A is 2x2 CSR:
+     * [1.0  2.0]
+     * [3.0  4.0]
+     */
+    CSR_Matrix *A = new_csr_matrix(2, 2, 4);
+    double Ax[4] = {1.0, 2.0, 3.0, 4.0};
+    int Ai[4] = {0, 1, 0, 1};
+    int Ap[3] = {0, 2, 4};
+    memcpy(A->x, Ax, 4 * sizeof(double));
+    memcpy(A->i, Ai, 4 * sizeof(int));
+    memcpy(A->p, Ap, 3 * sizeof(int));
+
+    /* x is (2*3)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+     * x1 = [1.0, 2.0], x2 = [3.0, 4.0], x3 = [5.0, 6.0]
+     */
+    double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+    block_left_multiply_vec(A, x, y, 3);
+
+    /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*1.0 + 4.0*2.0] =
+     * [5.0, 11.0] Expected block 2: y[2:4] = A @ x2 = [1.0*3.0 + 2.0*4.0, 3.0*3.0
+     * + 4.0*4.0] = [11.0, 25.0] Expected block 3: y[4:6] = A @ x3 = [1.0*5.0
+     * + 2.0*6.0, 3.0*5.0 + 4.0*6.0] = [17.0, 39.0]
+     */
+    double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0};
+    mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6));
+
+    free_csr_matrix(A);
+    return NULL;
+}