Skip to content
Merged
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON)
# =============================================================================
# C tests (only for standalone builds)
# =============================================================================
option(PROFILE_ONLY "Build only profiling tests" OFF)

if(NOT SKBUILD)
include_directories(${PROJECT_SOURCE_DIR}/tests)
enable_testing()
Expand All @@ -90,5 +92,11 @@ if(NOT SKBUILD)
tests/test_helpers.c
)
target_link_libraries(all_tests dnlp_diff)

if(PROFILE_ONLY)
target_compile_definitions(all_tests PRIVATE PROFILE_ONLY)
message(STATUS "Building ONLY profiling tests")
endif()

add_test(NAME AllTests COMMAND all_tests)
endif()
5 changes: 4 additions & 1 deletion include/subexpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ typedef struct left_matmul_expr
expr base;
CSR_Matrix *A;
CSR_Matrix *AT;
CSC_Matrix *CSC_work;
int n_blocks;
CSC_Matrix *Jchild_CSC;
CSC_Matrix *J_CSC;
int *csc_to_csr_workspace;
} left_matmul_expr;

/* Right matrix multiplication: y = f(x) * A where f(x) is an expression.
Expand Down
12 changes: 2 additions & 10 deletions include/utils/CSC_Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,7 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C)
CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork);
void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork);

/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC
* Precomputes sparsity pattern. No workspace required.
*/
CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B);

/* Fill values of C = A @ B where A is CSR, B is CSC
* C must have sparsity pattern already computed
*/
void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
CSR_Matrix *C);
CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork);
void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork);

#endif /* CSC_MATRIX_H */
1 change: 1 addition & 0 deletions include/utils/CSR_Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef struct CSR_Matrix

/* Allocate a new CSR matrix with given dimensions and nnz */
CSR_Matrix *new_csr_matrix(int m, int n, int nnz);
CSR_Matrix *new_csr(const CSR_Matrix *A);

/* Free a CSR matrix */
void free_csr_matrix(CSR_Matrix *matrix);
Expand Down
46 changes: 46 additions & 0 deletions include/utils/linalg.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef LINALG_H
#define LINALG_H

/* Forward declarations */
struct CSR_Matrix;
struct CSC_Matrix;

/* Compute sparsity pattern and values for the matrix-matrix multiplication
C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k,
without relying on generic sparse matrix-matrix multiplication. Specialized
logic for this is much faster (50-100x) than generic sparse matmul.

* J is provided in CSC format and is split into p blocks of n rows each
* C is returned in CSC format
* Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp],
where J = [J1; J2; ...; Jp]
*/
struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A,
const struct CSC_Matrix *J,
int p);

void block_left_multiply_fill_values(const struct CSR_Matrix *A,
const struct CSC_Matrix *J,
struct CSC_Matrix *C);

/* Compute y = kron(I_p, A) @ x where A is m x n and x is an (n*p)-length
   vector. The output y is an (m*p)-length vector corresponding to
   y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n
   elements.
*/
void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y,
int p);

/* Fill values of C = A @ B where A is CSR, B is CSC.
* C must have sparsity pattern already computed.
*/
void csr_csc_matmul_fill_values(const struct CSR_Matrix *A,
const struct CSC_Matrix *B, struct CSR_Matrix *C);

/* C = A @ B where A is CSR, B is CSC. Result C is CSR.
* Allocates and precomputes sparsity pattern. No workspace required.
*/
struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A,
const struct CSC_Matrix *B);

#endif /* LINALG_H */
23 changes: 12 additions & 11 deletions src/affine/broadcast.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static void jacobian_init(expr *node)
else
{
/* Scalar broadcast: (1, 1) -> (m, n) */
total_nnz = x->jacobian->nnz * node->d1 * node->d2;
total_nnz = x->jacobian->nnz * node->size;
}

node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz);
Expand All @@ -99,10 +99,10 @@ static void jacobian_init(expr *node)
// ---------------------------------------------------------------------
CSR_Matrix *Jx = x->jacobian;
CSR_Matrix *J = node->jacobian;
J->nnz = 0;

if (bcast->type == BROADCAST_ROW)
{
J->nnz = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
Expand All @@ -117,22 +117,23 @@ static void jacobian_init(expr *node)
J->nnz += nnz_in_row;
}
}
assert(J->nnz == total_nnz);
J->p[node->size] = total_nnz;
}
else if (bcast->type == BROADCAST_COL)
{

/* copy column indices */
tile_int(J->i, Jx->i, Jx->nnz, node->d2);

/* set row pointers */
int offset = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
for (int j = 0; j < node->d1; j++)
{
J->p[i * node->d1 + j] = offset;
offset += Jx->p[1] - Jx->p[0];
offset += nnz_in_row;
}
}
assert(offset == total_nnz);
Expand All @@ -141,12 +142,12 @@ static void jacobian_init(expr *node)
else
{
/* copy column indices */
tile_int(J->i, Jx->i, Jx->nnz, node->d1 * node->d2);
tile_int(J->i, Jx->i, Jx->nnz, node->size);

/* set row pointers */
int offset = 0;
int nnz = Jx->p[1] - Jx->p[0];
for (int i = 0; i < node->d1 * node->d2; i++)
for (int i = 0; i < node->size; i++)
{
J->p[i] = offset;
offset += nnz;
Expand All @@ -163,10 +164,10 @@ static void eval_jacobian(expr *node)
broadcast_expr *bcast = (broadcast_expr *) node;
CSR_Matrix *Jx = node->left->jacobian;
CSR_Matrix *J = node->jacobian;
J->nnz = 0;

if (bcast->type == BROADCAST_ROW)
{
J->nnz = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
Expand All @@ -180,7 +181,7 @@ static void eval_jacobian(expr *node)
}
else
{
tile_double(J->x, Jx->x, Jx->nnz, node->d1 * node->d2);
tile_double(J->x, Jx->x, Jx->nnz, node->size);
}
}

Expand Down Expand Up @@ -268,9 +269,9 @@ expr *new_broadcast(expr *child, int d1, int d2)
}
else
{
fprintf(
stderr,
"ERROR: inconsistency of broadcasting between DNLP-diff and CVXPY. \n");
fprintf(stderr,
"ERROR: inconsistency of broadcasting between SparseDifferentiation"
" and CVXPY. \n");
exit(1);
}

Expand Down
80 changes: 51 additions & 29 deletions src/bivariate/left_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/
#include "bivariate.h"
#include "subexpr.h"
#include "utils/Timer.h"
#include "utils/linalg.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
Expand All @@ -31,7 +33,9 @@
* To compute the forward pass: vec(y) = A_kron @ vec(f(x)),
where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p),
or more specifically, a block-diagonal matrix with p blocks of A along the
diagonal. In the refactored implementation we don't form A_kron explicitly,
diagonal. In the refactored implementation we don't form A_kron explicitly,
only conceptually. This led to a 100x speedup in the initialization of the
Jacobian sparsity pattern.

* To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the
Jacobian of f(x) of size (n*p) x n_vars.
Expand All @@ -42,7 +46,8 @@
Working in terms of A_kron unifies the implementation of f(x) being
vector-valued or matrix-valued.


I (dance858) think we can get additional big speedups when A is dense by
introducing a dense matrix class.
*/

#include "utils/utils.h"
Expand All @@ -55,7 +60,9 @@ static void forward(expr *node, const double *u)
node->left->forward(node->left, u);

/* y = A_kron @ vec(f(x)) */
csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value);
CSR_Matrix *A = ((left_matmul_expr *) node)->A;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(A, x->value, node->value, n_blocks);
}

static bool is_affine(const expr *node)
Expand All @@ -68,39 +75,47 @@ static void free_type_data(expr *node)
left_matmul_expr *lin_node = (left_matmul_expr *) node;
free_csr_matrix(lin_node->A);
free_csr_matrix(lin_node->AT);
if (lin_node->CSC_work)
{
free_csc_matrix(lin_node->CSC_work);
}
free_csc_matrix(lin_node->Jchild_CSC);
free_csc_matrix(lin_node->J_CSC);
free(lin_node->csc_to_csr_workspace);
lin_node->A = NULL;
lin_node->AT = NULL;
lin_node->CSC_work = NULL;
lin_node->Jchild_CSC = NULL;
lin_node->J_CSC = NULL;
lin_node->csc_to_csr_workspace = NULL;
}

static void jacobian_init(expr *node)
{
expr *x = node->left;
left_matmul_expr *lin_node = (left_matmul_expr *) node;

/* initialize child's jacobian and precompute sparsity of its transpose */
/* initialize child's jacobian and precompute sparsity of its CSC */
x->jacobian_init(x);
lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);

/* precompute sparsity of this node's jacobian */
node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work);
/* precompute sparsity of this node's jacobian in CSC and CSR */
lin_node->J_CSC = block_left_multiply_fill_sparsity(
lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
node->jacobian =
csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
}

static void eval_jacobian(expr *node)
{
expr *x = node->left;
left_matmul_expr *lin_node = (left_matmul_expr *) node;
left_matmul_expr *lnode = (left_matmul_expr *) node;

CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC;
CSC_Matrix *J_CSC = lnode->J_CSC;

/* evaluate child's jacobian and convert to CSC */
x->eval_jacobian(x);
csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork);
csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);

/* compute this node's jacobian */
csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian);
/* compute this node's jacobian: */
block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
}

static void wsum_hess_init(expr *node)
Expand All @@ -115,15 +130,17 @@ static void wsum_hess_init(expr *node)
memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int));

/* work for computing A^T w*/
int A_n = ((left_matmul_expr *) node)->A->n;
node->dwork = (double *) malloc(A_n * sizeof(double));
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
node->dwork = (double *) malloc(dim * sizeof(double));
}

static void eval_wsum_hess(expr *node, const double *w)
{
/* compute A^T w*/
left_matmul_expr *lin_node = (left_matmul_expr *) node;
csr_matvec_wo_offset(lin_node->AT, w, node->dwork);
CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(AT, w, node->dwork, n_blocks);

node->left->eval_wsum_hess(node->left, node->dwork);
memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
Expand All @@ -132,10 +149,10 @@ static void eval_wsum_hess(expr *node, const double *w)

expr *new_left_matmul(expr *u, const CSR_Matrix *A)
{
/* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do
A @ u where u is (n, ) which in C is actually (1, n). In that case the result
of A @ u is (m, ), which is (1, m) according to broadcasting rules. We
therefore check if this is the case. */
/* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users
to do A @ u where u is (n, ) which in C is actually (1, n). In that case
the result of A @ u is (m, ), which is (1, m) according to broadcasting
rules. We therefore check if this is the case. */
int d1, d2, n_blocks;
if (u->d1 == A->n)
{
Expand Down Expand Up @@ -164,12 +181,17 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
node->left = u;
expr_retain(u);

/* Initialize type-specific fields */
lin_node->A = block_diag_repeat_csr(A, n_blocks);
int alloc = MAX(lin_node->A->n, node->n_vars);
node->iwork = (int *) malloc(alloc * sizeof(int));
/* allocate workspace. iwork is used for transposing A (requiring size A->n)
and for converting J_child CSR to CSC (requiring size node->n_vars).
csc_to_csr_workspace is used for converting J_CSC to CSR (requiring size
node->size) */
node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
lin_node->n_blocks = n_blocks;

/* store A and AT */
lin_node->A = new_csr(A);
lin_node->AT = transpose(lin_node->A, node->iwork);
lin_node->CSC_work = NULL;

return node;
}
1 change: 1 addition & 0 deletions src/bivariate/right_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
#include "bivariate.h"
#include "subexpr.h"
#include "utils/linalg.h"
#include <stdlib.h>

/* This file implements the atom 'right_matmul' corresponding to the operation y =
Expand Down
2 changes: 1 addition & 1 deletion src/problem.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ static inline void print_end_message(const Diff_engine_stats *stats)
{
printf("\n"
"============================================================\n"
" DNLP Differentiation Engine v%s\n"
" SparseDifferentiation v%s\n"
" (c) D. Cederberg and W. Zhang, Stanford University, 2026\n"
"============================================================\n",
DIFF_ENGINE_VERSION);
Expand Down
Loading