SparseDifferentiation · Transurgeon · Mar 11, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
@@ -13,12 +13,24 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         build_type: [Release, Debug]
-        
+
     steps:
     - uses: actions/checkout@v3
 
+    - name: Install BLAS (Ubuntu)
+      if: runner.os == 'Linux'
+      run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
+
+    - name: Install OpenBLAS (Windows)
+      if: runner.os == 'Windows'
+      run: vcpkg install openblas:x64-windows
+
     - name: Configure CMake
-      run: cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_VERBOSE_MAKEFILE=ON
+      run: >
+        cmake -B build -S .
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+        -DCMAKE_VERBOSE_MAKEFILE=ON
+        ${{ runner.os == 'Windows' && '-DCMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake' || '' }}
 
     - name: Build
       run: cmake --build build --config ${{ matrix.build_type }}

diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml
@@ -16,6 +16,10 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
+      - name: Install BLAS (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
+
       # Configure CMake with ASan + UBSan
       - name: Configure with sanitizers
         run: cmake -B build -S . \

diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml
@@ -11,7 +11,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y valgrind
+          sudo apt-get install -y valgrind libopenblas-dev
 
       - name: Configure Debug build
         run: cmake -B build -S . -DCMAKE_BUILD_TYPE=Debug

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.15)
 project(DNLP_Diff_Engine C)
 set(CMAKE_C_STANDARD 99)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(DIFF_ENGINE_VERSION_MAJOR 0)
 set(DIFF_ENGINE_VERSION_MINOR 1)
@@ -53,6 +54,18 @@ if(NOT MSVC)
     target_link_libraries(dnlp_diff m)
 endif()
 
+# Find and link BLAS
+if(APPLE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    target_link_libraries(dnlp_diff ${ACCELERATE_FRAMEWORK})
+elseif(MSVC)
+    find_package(OpenBLAS CONFIG REQUIRED)
+    target_link_libraries(dnlp_diff OpenBLAS::OpenBLAS)
+else()
+    find_package(BLAS REQUIRED)
+    target_link_libraries(dnlp_diff ${BLAS_LIBRARIES})
+endif()
+
 # Config-specific compile options (compiler-specific)
 if(MSVC)
     target_compile_options(dnlp_diff PRIVATE

diff --git a/include/bivariate.h b/include/bivariate.h
@@ -30,12 +30,18 @@ expr *new_rel_entr_second_arg_scalar(expr *left, expr *right);
 /* Matrix multiplication: Z = X @ Y */
 expr *new_matmul(expr *x, expr *y);
 
-/* Left matrix multiplication: A @ f(x) where A is a constant matrix */
+/* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix */
 expr *new_left_matmul(expr *u, const CSR_Matrix *A);
 
+/* Left matrix multiplication: A @ f(x) where A is a constant dense matrix
+ * (row-major, m x n). Uses CBLAS for efficient computation. */
+expr *new_left_matmul_dense(expr *u, int m, int n, const double *data);
+
 /* Right matrix multiplication: f(x) @ A where A is a constant matrix */
 expr *new_right_matmul(expr *u, const CSR_Matrix *A);
 
+expr *new_right_matmul_dense(expr *u, int m, int n, const double *data);
+
 /* Constant scalar multiplication: a * f(x) where a is a constant double */
 expr *new_const_scalar_mult(double a, expr *child);
 

diff --git a/include/subexpr.h b/include/subexpr.h
@@ -21,6 +21,7 @@
 #include "expr.h"
 #include "utils/CSC_Matrix.h"
 #include "utils/CSR_Matrix.h"
+#include "utils/matrix.h"
 
 /* Forward declaration */
 struct int_double_pair;
@@ -107,12 +108,12 @@ important distinction compared to linear_op_expr. */
 typedef struct left_matmul_expr
 {
     expr base;
-    CSR_Matrix *A;
-    CSR_Matrix *AT;
+    Matrix *A;
+    Matrix *AT;
     int n_blocks;
     CSC_Matrix *Jchild_CSC;
     CSC_Matrix *J_CSC;
-    int *csc_to_csr_workspace;
+    int *csc_to_csr_work;
 } left_matmul_expr;
 
 /* Right matrix multiplication: y = f(x) * A where f(x) is an expression.

diff --git a/include/utils/cblas_wrapper.h b/include/utils/cblas_wrapper.h
@@ -0,0 +1,11 @@
+#ifndef CBLAS_WRAPPER_H
+#define CBLAS_WRAPPER_H
+
+#ifdef __APPLE__
+#define ACCELERATE_NEW_LAPACK
+#include <Accelerate/Accelerate.h>
+#else
+#include <cblas.h>
+#endif
+
+#endif /* CBLAS_WRAPPER_H */
diff --git a/include/utils/matrix.h b/include/utils/matrix.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the DNLP-differentiation-engine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include "CSC_Matrix.h"
+#include "CSR_Matrix.h"
+
+/* Base matrix type with function pointers for polymorphic dispatch */
+typedef struct Matrix
+{
+    int m, n;
+    void (*block_left_mult_vec)(const struct Matrix *self, const double *x,
+                                double *y, int p);
+    CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self,
+                                            const CSC_Matrix *J, int p);
+    void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J,
+                                   CSC_Matrix *C);
+    void (*free_fn)(struct Matrix *self);
+} Matrix;
+
+/* Sparse matrix wrapping CSR */
+typedef struct Sparse_Matrix
+{
+    Matrix base;
+    CSR_Matrix *csr;
+} Sparse_Matrix;
+
+/* Dense matrix (row-major) */
+typedef struct Dense_Matrix
+{
+    Matrix base;
+    double *x;
+    double *work; /* scratch buffer, length n */
+} Dense_Matrix;
+
+/* Constructors */
+Matrix *new_sparse_matrix(const CSR_Matrix *A);
+Matrix *new_dense_matrix(int m, int n, const double *data);
+
+/* Transpose helpers */
+Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork);
+Matrix *dense_matrix_trans(const Dense_Matrix *self);
+
+/* Free helper */
+static inline void free_matrix(Matrix *m)
+{
+    if (m)
+    {
+        m->free_fn(m);
+    }
+}
+
+#endif /* MATRIX_H */
diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c
@@ -17,8 +17,7 @@
  */
 #include "bivariate.h"
 #include "subexpr.h"
-#include "utils/Timer.h"
-#include "utils/linalg_sparse_matmuls.h"
+#include "utils/matrix.h"
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -45,9 +44,6 @@
 
     Working in terms of A_kron unifies the implementation of f(x) being
     vector-valued or matrix-valued.
-
-    I (dance858) think we can get additional big speedups when A is dense by
-    introducing a dense matrix class.
 */
 
 #include "utils/utils.h"
@@ -60,9 +56,9 @@ static void forward(expr *node, const double *u)
     node->left->forward(node->left, u);
 
     /* y = A_kron @ vec(f(x)) */
-    CSR_Matrix *A = ((left_matmul_expr *) node)->A;
+    Matrix *A = ((left_matmul_expr *) node)->A;
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    block_left_multiply_vec(A, x->value, node->value, n_blocks);
+    A->block_left_mult_vec(A, x->value, node->value, n_blocks);
 }
 
 static bool is_affine(const expr *node)
@@ -72,33 +68,32 @@ static bool is_affine(const expr *node)
 
 static void free_type_data(expr *node)
 {
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
-    free_csr_matrix(lin_node->A);
-    free_csr_matrix(lin_node->AT);
-    free_csc_matrix(lin_node->Jchild_CSC);
-    free_csc_matrix(lin_node->J_CSC);
-    free(lin_node->csc_to_csr_workspace);
-    lin_node->A = NULL;
-    lin_node->AT = NULL;
-    lin_node->Jchild_CSC = NULL;
-    lin_node->J_CSC = NULL;
-    lin_node->csc_to_csr_workspace = NULL;
+    left_matmul_expr *lnode = (left_matmul_expr *) node;
+    free_matrix(lnode->A);
+    free_matrix(lnode->AT);
+    free_csc_matrix(lnode->Jchild_CSC);
+    free_csc_matrix(lnode->J_CSC);
+    free(lnode->csc_to_csr_work);
+    lnode->A = NULL;
+    lnode->AT = NULL;
+    lnode->Jchild_CSC = NULL;
+    lnode->J_CSC = NULL;
+    lnode->csc_to_csr_work = NULL;
 }
 
 static void jacobian_init(expr *node)
 {
     expr *x = node->left;
-    left_matmul_expr *lin_node = (left_matmul_expr *) node;
+    left_matmul_expr *lnode = (left_matmul_expr *) node;
 
     /* initialize child's jacobian and precompute sparsity of its CSC */
     x->jacobian_init(x);
-    lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
+    lnode->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
 
     /* precompute sparsity of this node's jacobian in CSC and CSR */
-    lin_node->J_CSC = block_left_multiply_fill_sparsity(
-        lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
-    node->jacobian =
-        csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
+    lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC,
+                                                      lnode->n_blocks);
+    node->jacobian = csc_to_csr_fill_sparsity(lnode->J_CSC, lnode->csc_to_csr_work);
 }
 
 static void eval_jacobian(expr *node)
@@ -114,8 +109,8 @@ static void eval_jacobian(expr *node)
     csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);
 
     /* compute this node's jacobian: */
-    block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
-    csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
+    lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC);
+    csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_work);
 }
 
 static void wsum_hess_init(expr *node)
@@ -131,16 +126,16 @@ static void wsum_hess_init(expr *node)
 
     /* work for computing A^T w*/
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
+    int dim = ((left_matmul_expr *) node)->AT->m * n_blocks;
     node->dwork = (double *) malloc(dim * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
 {
     /* compute A^T w*/
-    CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
+    Matrix *AT = ((left_matmul_expr *) node)->AT;
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
-    block_left_multiply_vec(AT, w, node->dwork, n_blocks);
+    AT->block_left_mult_vec(AT, w, node->dwork, n_blocks);
 
     node->left->eval_wsum_hess(node->left, node->dwork);
     memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
@@ -173,25 +168,64 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
     }
 
     /* Allocate the type-specific struct */
-    left_matmul_expr *lin_node =
+    left_matmul_expr *lnode =
         (left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
-    expr *node = &lin_node->base;
+    expr *node = &lnode->base;
     init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
               is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
     node->left = u;
     expr_retain(u);
 
-    /* allocate workspace. iwork is used for transposing A (requiring size A->n)
-       and for converting J_child csr to csc (requring size node->n_vars).
-       csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size)
-     */
+    /* allocate workspace. iwork is used for converting J_child csr to csc
+       (requiring size node->n_vars) and for transposing A (requiring size A->n).
+       csc_to_csr_work is used for converting J_CSC to CSR (requiring
+       node->size) */
     node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
-    lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
-    lin_node->n_blocks = n_blocks;
+    lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
+    lnode->n_blocks = n_blocks;
 
     /* store A and AT */
-    lin_node->A = new_csr(A);
-    lin_node->AT = transpose(lin_node->A, node->iwork);
+    lnode->A = new_sparse_matrix(A);
+    lnode->AT = sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->iwork);
+
+    return node;
+}
+
+expr *new_left_matmul_dense(expr *u, int m, int n, const double *data)
+{
+    int d1, d2, n_blocks;
+    if (u->d1 == n)
+    {
+        d1 = m;
+        d2 = u->d2;
+        n_blocks = u->d2;
+    }
+    else if (u->d2 == n && u->d1 == 1)
+    {
+        d1 = 1;
+        d2 = m;
+        n_blocks = 1;
+    }
+    else
+    {
+        fprintf(stderr, "Error in new_left_matmul_dense: dimension mismatch\n");
+        exit(1);
+    }
+
+    left_matmul_expr *lnode =
+        (left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
+    expr *node = &lnode->base;
+    init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
+              is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
+    node->left = u;
+    expr_retain(u);
+
+    node->iwork = (int *) malloc(MAX(n, node->n_vars) * sizeof(int));
+    lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
+    lnode->n_blocks = n_blocks;
+
+    lnode->A = new_dense_matrix(m, n, data);
+    lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A);
 
     return node;
 }