Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,24 @@ jobs:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
build_type: [Release, Debug]

steps:
- uses: actions/checkout@v3

- name: Install BLAS (Ubuntu)
if: runner.os == 'Linux'
run: sudo apt-get update && sudo apt-get install -y libopenblas-dev

- name: Install OpenBLAS (Windows)
if: runner.os == 'Windows'
run: vcpkg install openblas:x64-windows

- name: Configure CMake
run: cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_VERBOSE_MAKEFILE=ON
run: >
cmake -B build -S .
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCMAKE_VERBOSE_MAKEFILE=ON
${{ runner.os == 'Windows' && '-DCMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake' || '' }}

- name: Build
run: cmake --build build --config ${{ matrix.build_type }}
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/sanitizer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ jobs:
steps:
- uses: actions/checkout@v5

- name: Install BLAS (Ubuntu)
if: runner.os == 'Linux'
run: sudo apt-get update && sudo apt-get install -y libopenblas-dev

# Configure CMake with ASan + UBSan
- name: Configure with sanitizers
run: cmake -B build -S . \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/valgrind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y valgrind
sudo apt-get install -y valgrind libopenblas-dev

- name: Configure Debug build
run: cmake -B build -S . -DCMAKE_BUILD_TYPE=Debug
Expand Down
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.15)
project(DNLP_Diff_Engine C)
set(CMAKE_C_STANDARD 99)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(DIFF_ENGINE_VERSION_MAJOR 0)
set(DIFF_ENGINE_VERSION_MINOR 1)
Expand Down Expand Up @@ -53,6 +54,18 @@ if(NOT MSVC)
target_link_libraries(dnlp_diff m)
endif()

# Find and link BLAS
if(APPLE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
target_link_libraries(dnlp_diff ${ACCELERATE_FRAMEWORK})
elseif(MSVC)
find_package(OpenBLAS CONFIG REQUIRED)
target_link_libraries(dnlp_diff OpenBLAS::OpenBLAS)
else()
find_package(BLAS REQUIRED)
target_link_libraries(dnlp_diff ${BLAS_LIBRARIES})
endif()

# Config-specific compile options (compiler-specific)
if(MSVC)
target_compile_options(dnlp_diff PRIVATE
Expand Down
8 changes: 7 additions & 1 deletion include/bivariate.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@ expr *new_rel_entr_second_arg_scalar(expr *left, expr *right);
/* Matrix multiplication: Z = X @ Y */
expr *new_matmul(expr *x, expr *y);

/* Left matrix multiplication: A @ f(x) where A is a constant matrix */
/* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix */
expr *new_left_matmul(expr *u, const CSR_Matrix *A);

/* Left matrix multiplication: A @ f(x) where A is a constant dense matrix
* (row-major, m x n). Uses CBLAS for efficient computation. */
expr *new_left_matmul_dense(expr *u, int m, int n, const double *data);

/* Right matrix multiplication: f(x) @ A where A is a constant matrix */
expr *new_right_matmul(expr *u, const CSR_Matrix *A);

expr *new_right_matmul_dense(expr *u, int m, int n, const double *data);

/* Constant scalar multiplication: a * f(x) where a is a constant double */
expr *new_const_scalar_mult(double a, expr *child);

Expand Down
7 changes: 4 additions & 3 deletions include/subexpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "expr.h"
#include "utils/CSC_Matrix.h"
#include "utils/CSR_Matrix.h"
#include "utils/matrix.h"

/* Forward declaration */
struct int_double_pair;
Expand Down Expand Up @@ -107,12 +108,12 @@ important distinction compared to linear_op_expr. */
typedef struct left_matmul_expr
{
expr base;
CSR_Matrix *A;
CSR_Matrix *AT;
Matrix *A;
Matrix *AT;
int n_blocks;
CSC_Matrix *Jchild_CSC;
CSC_Matrix *J_CSC;
int *csc_to_csr_workspace;
int *csc_to_csr_work;
} left_matmul_expr;

/* Right matrix multiplication: y = f(x) * A where f(x) is an expression.
Expand Down
11 changes: 11 additions & 0 deletions include/utils/cblas_wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef CBLAS_WRAPPER_H
#define CBLAS_WRAPPER_H

#ifdef __APPLE__
#define ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif

#endif /* CBLAS_WRAPPER_H */
69 changes: 69 additions & 0 deletions include/utils/matrix.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright 2026 Daniel Cederberg and William Zhang
*
* This file is part of the DNLP-differentiation-engine project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MATRIX_H
#define MATRIX_H

#include "CSC_Matrix.h"
#include "CSR_Matrix.h"

/* Base matrix type with function pointers for polymorphic dispatch */
typedef struct Matrix
{
int m, n;
void (*block_left_mult_vec)(const struct Matrix *self, const double *x,
double *y, int p);
CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self,
const CSC_Matrix *J, int p);
void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J,
CSC_Matrix *C);
void (*free_fn)(struct Matrix *self);
} Matrix;

/* Sparse matrix wrapping CSR */
typedef struct Sparse_Matrix
{
Matrix base;
CSR_Matrix *csr;
} Sparse_Matrix;

/* Dense matrix (row-major) */
typedef struct Dense_Matrix
{
Matrix base;
double *x;
double *work; /* scratch buffer, length n */
} Dense_Matrix;

/* Constructors */
Matrix *new_sparse_matrix(const CSR_Matrix *A);
Matrix *new_dense_matrix(int m, int n, const double *data);

/* Transpose helpers */
Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork);
Matrix *dense_matrix_trans(const Dense_Matrix *self);

/* Free helper */
static inline void free_matrix(Matrix *m)
{
if (m)
{
m->free_fn(m);
}
}

#endif /* MATRIX_H */
112 changes: 73 additions & 39 deletions src/bivariate/left_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
*/
#include "bivariate.h"
#include "subexpr.h"
#include "utils/Timer.h"
#include "utils/linalg_sparse_matmuls.h"
#include "utils/matrix.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
Expand All @@ -45,9 +44,6 @@

Working in terms of A_kron unifies the implementation of f(x) being
vector-valued or matrix-valued.

I (dance858) think we can get additional big speedups when A is dense by
introducing a dense matrix class.
*/

#include "utils/utils.h"
Expand All @@ -60,9 +56,9 @@ static void forward(expr *node, const double *u)
node->left->forward(node->left, u);

/* y = A_kron @ vec(f(x)) */
CSR_Matrix *A = ((left_matmul_expr *) node)->A;
Matrix *A = ((left_matmul_expr *) node)->A;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(A, x->value, node->value, n_blocks);
A->block_left_mult_vec(A, x->value, node->value, n_blocks);
}

static bool is_affine(const expr *node)
Expand All @@ -72,33 +68,32 @@ static bool is_affine(const expr *node)

static void free_type_data(expr *node)
{
left_matmul_expr *lin_node = (left_matmul_expr *) node;
free_csr_matrix(lin_node->A);
free_csr_matrix(lin_node->AT);
free_csc_matrix(lin_node->Jchild_CSC);
free_csc_matrix(lin_node->J_CSC);
free(lin_node->csc_to_csr_workspace);
lin_node->A = NULL;
lin_node->AT = NULL;
lin_node->Jchild_CSC = NULL;
lin_node->J_CSC = NULL;
lin_node->csc_to_csr_workspace = NULL;
left_matmul_expr *lnode = (left_matmul_expr *) node;
free_matrix(lnode->A);
free_matrix(lnode->AT);
free_csc_matrix(lnode->Jchild_CSC);
free_csc_matrix(lnode->J_CSC);
free(lnode->csc_to_csr_work);
lnode->A = NULL;
lnode->AT = NULL;
lnode->Jchild_CSC = NULL;
lnode->J_CSC = NULL;
lnode->csc_to_csr_work = NULL;
}

static void jacobian_init(expr *node)
{
expr *x = node->left;
left_matmul_expr *lin_node = (left_matmul_expr *) node;
left_matmul_expr *lnode = (left_matmul_expr *) node;

/* initialize child's jacobian and precompute sparsity of its CSC */
x->jacobian_init(x);
lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);
lnode->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork);

/* precompute sparsity of this node's jacobian in CSC and CSR */
lin_node->J_CSC = block_left_multiply_fill_sparsity(
lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks);
node->jacobian =
csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace);
lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC,
lnode->n_blocks);
node->jacobian = csc_to_csr_fill_sparsity(lnode->J_CSC, lnode->csc_to_csr_work);
}

static void eval_jacobian(expr *node)
Expand All @@ -114,8 +109,8 @@ static void eval_jacobian(expr *node)
csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork);

/* compute this node's jacobian: */
block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC);
csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace);
lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC);
csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_work);
}

static void wsum_hess_init(expr *node)
Expand All @@ -131,16 +126,16 @@ static void wsum_hess_init(expr *node)

/* work for computing A^T w*/
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
int dim = ((left_matmul_expr *) node)->A->n * n_blocks;
int dim = ((left_matmul_expr *) node)->AT->m * n_blocks;
node->dwork = (double *) malloc(dim * sizeof(double));
}

static void eval_wsum_hess(expr *node, const double *w)
{
/* compute A^T w*/
CSR_Matrix *AT = ((left_matmul_expr *) node)->AT;
Matrix *AT = ((left_matmul_expr *) node)->AT;
int n_blocks = ((left_matmul_expr *) node)->n_blocks;
block_left_multiply_vec(AT, w, node->dwork, n_blocks);
AT->block_left_mult_vec(AT, w, node->dwork, n_blocks);

node->left->eval_wsum_hess(node->left, node->dwork);
memcpy(node->wsum_hess->x, node->left->wsum_hess->x,
Expand Down Expand Up @@ -173,25 +168,64 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A)
}

/* Allocate the type-specific struct */
left_matmul_expr *lin_node =
left_matmul_expr *lnode =
(left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
expr *node = &lin_node->base;
expr *node = &lnode->base;
init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
node->left = u;
expr_retain(u);

/* allocate workspace. iwork is used for transposing A (requiring size A->n)
and for converting J_child csr to csc (requring size node->n_vars).
csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size)
*/
/* allocate workspace. iwork is used for converting J_child csr to csc
(requiring size node->n_vars) and for transposing A (requiring size A->n).
csc_to_csr_work is used for converting J_CSC to CSR (requiring
node->size) */
node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int));
lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int));
lin_node->n_blocks = n_blocks;
lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
lnode->n_blocks = n_blocks;

/* store A and AT */
lin_node->A = new_csr(A);
lin_node->AT = transpose(lin_node->A, node->iwork);
lnode->A = new_sparse_matrix(A);
lnode->AT = sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->iwork);

return node;
}

expr *new_left_matmul_dense(expr *u, int m, int n, const double *data)
{
int d1, d2, n_blocks;
if (u->d1 == n)
{
d1 = m;
d2 = u->d2;
n_blocks = u->d2;
}
else if (u->d2 == n && u->d1 == 1)
{
d1 = 1;
d2 = m;
n_blocks = 1;
}
else
{
fprintf(stderr, "Error in new_left_matmul_dense: dimension mismatch\n");
exit(1);
}

left_matmul_expr *lnode =
(left_matmul_expr *) calloc(1, sizeof(left_matmul_expr));
expr *node = &lnode->base;
init_expr(node, d1, d2, u->n_vars, forward, jacobian_init, eval_jacobian,
is_affine, wsum_hess_init, eval_wsum_hess, free_type_data);
node->left = u;
expr_retain(u);

node->iwork = (int *) malloc(MAX(n, node->n_vars) * sizeof(int));
lnode->csc_to_csr_work = (int *) malloc(node->size * sizeof(int));
lnode->n_blocks = n_blocks;

lnode->A = new_dense_matrix(m, n, data);
lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A);

return node;
}
Loading