Skip to content
Merged
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON)
# =============================================================================
# C tests (only for standalone builds)
# =============================================================================
option(PROFILE_ONLY "Build only profiling tests" OFF)

if(NOT SKBUILD)
include_directories(${PROJECT_SOURCE_DIR}/tests)
enable_testing()
Expand All @@ -90,5 +92,11 @@ if(NOT SKBUILD)
tests/test_helpers.c
)
target_link_libraries(all_tests dnlp_diff)

if(PROFILE_ONLY)
target_compile_definitions(all_tests PRIVATE PROFILE_ONLY)
message(STATUS "Building ONLY profiling tests")
endif()

add_test(NAME AllTests COMMAND all_tests)
endif()
5 changes: 4 additions & 1 deletion include/subexpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ typedef struct left_matmul_expr
expr base;
CSR_Matrix *A;
CSR_Matrix *AT;
CSC_Matrix *CSC_work;
int n_blocks;
CSC_Matrix *Jchild_CSC;
CSC_Matrix *J_CSC;
int *csc_to_csr_workspace;
} left_matmul_expr;

/* Right matrix multiplication: y = f(x) * A where f(x) is an expression.
Expand Down
12 changes: 2 additions & 10 deletions include/utils/CSC_Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,7 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C)
CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork);
void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork);

/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC
* Precomputes sparsity pattern. No workspace required.
*/
CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B);

/* Fill values of C = A @ B where A is CSR, B is CSC
* C must have sparsity pattern already computed
*/
void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
CSR_Matrix *C);
CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork);
void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork);

#endif /* CSC_MATRIX_H */
1 change: 1 addition & 0 deletions include/utils/CSR_Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef struct CSR_Matrix

/* Allocate a new CSR matrix with given dimensions and nnz */
CSR_Matrix *new_csr_matrix(int m, int n, int nnz);
CSR_Matrix *new_csr(const CSR_Matrix *A);

/* Free a CSR matrix */
void free_csr_matrix(CSR_Matrix *matrix);
Expand Down
46 changes: 46 additions & 0 deletions include/utils/linalg.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef LINALG_H
#define LINALG_H

/* Forward declarations */
struct CSR_Matrix;
struct CSC_Matrix;

/* Compute sparsity pattern and values for the matrix-matrix multiplication
C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k,
without relying on generic sparse matrix-matrix multiplication. Specialized
logic for this is much faster (50-100x) than generic sparse matmul.

* J is provided in CSC format and is split into p blocks of n rows each
* C is returned in CSC format
* Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp],
where J = [J1; J2; ...; Jp]
*/
struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A,
const struct CSC_Matrix *J,
int p);

void block_left_multiply_fill_values(const struct CSR_Matrix *A,
const struct CSC_Matrix *J,
struct CSC_Matrix *C);

/* Compute y = kron(I_p, A) @ x where A is m x n and x is an (n*p)-length
   vector. The output y is an (m*p)-length vector corresponding to
   y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n
   elements.
*/
void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y,
int p);

/* Fill values of C = A @ B where A is CSR, B is CSC.
* C must have sparsity pattern already computed.
*/
void csr_csc_matmul_fill_values(const struct CSR_Matrix *A,
const struct CSC_Matrix *B, struct CSR_Matrix *C);

/* C = A @ B where A is CSR, B is CSC. Result C is CSR.
* Allocates and precomputes sparsity pattern. No workspace required.
*/
struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A,
const struct CSC_Matrix *B);

#endif /* LINALG_H */
23 changes: 12 additions & 11 deletions src/affine/broadcast.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static void jacobian_init(expr *node)
else
{
/* Scalar broadcast: (1, 1) -> (m, n) */
total_nnz = x->jacobian->nnz * node->d1 * node->d2;
total_nnz = x->jacobian->nnz * node->size;
}

node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz);
Expand All @@ -99,10 +99,10 @@ static void jacobian_init(expr *node)
// ---------------------------------------------------------------------
CSR_Matrix *Jx = x->jacobian;
CSR_Matrix *J = node->jacobian;
J->nnz = 0;

if (bcast->type == BROADCAST_ROW)
{
J->nnz = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
Expand All @@ -117,22 +117,23 @@ static void jacobian_init(expr *node)
J->nnz += nnz_in_row;
}
}
assert(J->nnz == total_nnz);
J->p[node->size] = total_nnz;
}
else if (bcast->type == BROADCAST_COL)
{

/* copy column indices */
tile_int(J->i, Jx->i, Jx->nnz, node->d2);

/* set row pointers */
int offset = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
for (int j = 0; j < node->d1; j++)
{
J->p[i * node->d1 + j] = offset;
offset += Jx->p[1] - Jx->p[0];
offset += nnz_in_row;
}
}
assert(offset == total_nnz);
Expand All @@ -141,12 +142,12 @@ static void jacobian_init(expr *node)
else
{
/* copy column indices */
tile_int(J->i, Jx->i, Jx->nnz, node->d1 * node->d2);
tile_int(J->i, Jx->i, Jx->nnz, node->size);

/* set row pointers */
int offset = 0;
int nnz = Jx->p[1] - Jx->p[0];
for (int i = 0; i < node->d1 * node->d2; i++)
for (int i = 0; i < node->size; i++)
{
J->p[i] = offset;
offset += nnz;
Expand All @@ -163,10 +164,10 @@ static void eval_jacobian(expr *node)
broadcast_expr *bcast = (broadcast_expr *) node;
CSR_Matrix *Jx = node->left->jacobian;
CSR_Matrix *J = node->jacobian;
J->nnz = 0;

if (bcast->type == BROADCAST_ROW)
{
J->nnz = 0;
for (int i = 0; i < node->d2; i++)
{
int nnz_in_row = Jx->p[i + 1] - Jx->p[i];
Expand All @@ -180,7 +181,7 @@ static void eval_jacobian(expr *node)
}
else
{
tile_double(J->x, Jx->x, Jx->nnz, node->d1 * node->d2);
tile_double(J->x, Jx->x, Jx->nnz, node->size);
}
}

Expand Down Expand Up @@ -268,9 +269,9 @@ expr *new_broadcast(expr *child, int d1, int d2)
}
else
{
fprintf(
stderr,
"ERROR: inconsistency of broadcasting between DNLP-diff and CVXPY. \n");
fprintf(stderr,
"ERROR: inconsistency of broadcasting between SparseDifferentiation"
" and CVXPY. \n");
exit(1);
}

Expand Down
80 changes: 51 additions & 29 deletions src/bivariate/left_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/
#include "bivariate.h"
#include "subexpr.h"
#include "utils/Timer.h"
#include "utils/linalg.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
Expand All @@ -31,7 +33,9 @@
* To compute the forward pass: vec(y) = A_kron @ vec(f(x)),
where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p),
or more specifically, a block-diagonal matrix with p blocks of A along the
diagonal. In the refactored implementation we don't form A_kron explicitly,
diagonal. In the refactored implementation we don't form A_kron explicitly,
only conceptually. This led to a 100x speedup in the initialization of the
Jacobian sparsity pattern.

* To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the
Jacobian of f(x) of size (n*p) x n_vars.
Expand All @@ -42,7 +46,8 @@
Working in terms of A_kron unifies the implementation of f(x) being
vector-valued or matrix-valued.


I (dance858) think we can get additional big speedups when A is dense by
introducing a dense matrix class.
*/

#include "utils/utils.h"
Expand All @@ -55,7 +60,9 @@ static void forward(expr *node, const double *u)
node->left->forward(node->left, u);

/* y = A_kron @ vec(f(x)) */
csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value);
CSR_Matrix *A = ((left_matmul_expr *) node)->A;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(A, x->value, node->value, n_blocks);
}

static bool is_affine(const expr *node)
Expand All @@ -68,39 +75,47 @@ static void free_type_data(expr *node)
left_matmul_expr *lin_node = (left_matmul_expr *) node;
free_csr_matrix(lin_node->A);
free_csr_matrix(lin_node->AT);
if (lin_node->CSC_work)
{
free_csc_matrix(lin_node->CSC_work);
}
free_csc_matrix(lin_node->Jchild_CSC);
free_csc_matrix(lin_node->J_CSC);
free(lin_node->csc_to_csr_workspace);
lin_node->A = NULL;
lin_node->AT = NULL;
lin_node->CSC_work = NULL;
lin_node->Jchild_CSC = NULL;
lin_node->J_CSC = NULL;
lin_node->csc_to_csr_workspace = NULL;
}

static void jacobian_init(expr *node)
{
expr *x = node->left;
left_matmul_expr *lin_node = (left_matmul_expr *) node;

/* initialize child's jacobian and precompute sparsity of its transpose */
/* initialize child's jacobian and precompute sparsity of its CSC */
x->jacobian_init(x);
lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);

/* precompute sparsity of this node's jacobian */
node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work);
/* precompute sparsity of this node's jacobian in CSC and CSR */
lin_node->J_CSC = block_left_multiply_fill_sparsity(
lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
node->jacobian =
csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
}

static void eval_jacobian(expr *node)
{
expr *x = node->left;
left_matmul_expr *lin_node = (left_matmul_expr *) node;
left_matmul_expr *lnode = (left_matmul_expr *) node;

CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC;
CSC_Matrix *J_CSC = lnode->J_CSC;

/* evaluate child's jacobian and convert to CSC */
x->eval_jacobian(x);
csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork);
csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);

/* compute this node's jacobian */
csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian);
/* compute this node's jacobian: */
block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
}

static void wsum_hess_init(expr *node)
Expand All @@ -115,15 +130,17 @@ static void wsum_hess_init(expr *node)
memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int));

/* work for computing A^T w*/
int A_n = ((left_matmul_expr *) node)->A->n;
node->dwork = (double *) malloc(A_n * sizeof(double));
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
node->dwork = (double *) malloc(dim * sizeof(double));
}

static void eval_wsum_hess(expr *node, const double *w)
{
/* compute A^T w*/
left_matmul_expr *lin_node = (left_matmul_expr *) node;
csr_matvec_wo_offset(lin_node->AT, w, node->dwork);
CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(AT, w, node->dwork, n_blocks);

node->left->eval_wsum_hess(node->left, node->dwork);
memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
Expand All @@ -132,10 +149,10 @@ static void eval_wsum_hess(expr *node, const double *w)

expr *new_left_matmul(expr *u, const CSR_Matrix *A)
{
/* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do
A @ u where u is (n, ) which in C is actually (1, n). In that case the result
of A @ u is (m, ), which is (1, m) according to broadcasting rules. We
therefore check if this is the case. */
/* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users
to do A @ u where u is (n, ) which in C is actually (1, n). In that case
the result of A @ u is (m, ), which is (1, m) according to broadcasting
rules. We therefore check if this is the case. */
int d1, d2, n_blocks;
if (u->d1 == A->n)
{
Expand Down Expand Up @@ -164,12 +181,17 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
node->left = u;
expr_retain(u);

/* Initialize type-specific fields */
lin_node->A = block_diag_repeat_csr(A, n_blocks);
int alloc = MAX(lin_node->A->n, node->n_vars);
node->iwork = (int *) malloc(alloc * sizeof(int));
/* allocate workspace. iwork is used for transposing A (requiring size A->n)
and for converting J_child CSR to CSC (requiring size node->n_vars).
csc_to_csr_workspace is used for converting J_CSC to CSR (requiring size
node->size) */
node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
lin_node->n_blocks = n_blocks;

/* store A and AT */
lin_node->A = new_csr(A);
lin_node->AT = transpose(lin_node->A, node->iwork);
lin_node->CSC_work = NULL;

return node;
}
1 change: 1 addition & 0 deletions src/bivariate/right_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
#include "bivariate.h"
#include "subexpr.h"
#include "utils/linalg.h"
#include <stdlib.h>

/* This file implements the atom 'right_matmul' corresponding to the operation y =
Expand Down
2 changes: 1 addition & 1 deletion src/problem.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ static inline void print_end_message(const Diff_engine_stats *stats)
{
printf("\n"
"============================================================\n"
" DNLP Differentiation Engine v%s\n"
" SparseDifferentiation v%s\n"
" (c) D. Cederberg and W. Zhang, Stanford University, 2026\n"
"============================================================\n",
DIFF_ENGINE_VERSION);
Expand Down
Loading