From de23e181da0961927fc822aa245d25b4c7f61c5c Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 07:30:15 -0800 Subject: [PATCH 01/13] test for profiling --- CMakeLists.txt | 8 +++++ tests/all_tests.c | 19 +++++++++++ tests/profiling/profile_left_matmul.h | 49 +++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 tests/profiling/profile_left_matmul.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ce9857..e9c4d9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON) # ============================================================================= # C tests (only for standalone builds) # ============================================================================= +option(PROFILE_ONLY "Build only profiling tests" OFF) + if(NOT SKBUILD) include_directories(${PROJECT_SOURCE_DIR}/tests) enable_testing() @@ -90,5 +92,11 @@ if(NOT SKBUILD) tests/test_helpers.c ) target_link_libraries(all_tests dnlp_diff) + + if(PROFILE_ONLY) + target_compile_definitions(all_tests PRIVATE PROFILE_ONLY) + message(STATUS "Building ONLY profiling tests") + endif() + add_test(NAME AllTests COMMAND all_tests) endif() diff --git a/tests/all_tests.c b/tests/all_tests.c index 4e7571a..1887639 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -1,8 +1,11 @@ +all_tests + #include #include "minunit.h" /* Include all test headers */ +#ifndef PROFILE_ONLY #include "forward_pass/affine/test_add.h" #include "forward_pass/affine/test_broadcast.h" #include "forward_pass/affine/test_hstack.h" @@ -41,6 +44,7 @@ #include "jacobian_tests/test_sum.h" #include "jacobian_tests/test_trace.h" #include "jacobian_tests/test_transpose.h" +#include "problem/test_param_prob.h" #include "problem/test_problem.h" #include "utils/test_csc_matrix.h" #include "utils/test_csr_matrix.h" @@ -72,6 +76,11 @@ #include "wsum_hess/test_sum.h" #include "wsum_hess/test_trace.h" #include "wsum_hess/test_transpose.h" +#endif /* PROFILE_ONLY */ + +#ifdef PROFILE_ONLY +#include "profiling/profile_left_matmul.h" +#endif /* PROFILE_ONLY */ int main(void) { @@ -79,6 +88,7 @@ int main(void) int tests_run = 0; +#ifndef PROFILE_ONLY printf("--- Forward Pass Tests ---\n"); mu_run_test(test_variable, tests_run); mu_run_test(test_constant, tests_run); @@ -257,6 +267,15 @@ int main(void) mu_run_test(test_problem_jacobian_multi, tests_run); mu_run_test(test_problem_constraint_forward, tests_run); mu_run_test(test_problem_hessian, tests_run); + mu_run_test(test_param_scalar_mult_problem, tests_run); + mu_run_test(test_param_vector_mult_problem, tests_run); + mu_run_test(test_param_left_matmul_problem, tests_run); +#endif /* PROFILE_ONLY */ + +#ifdef PROFILE_ONLY + printf("\n--- Profiling Tests ---\n"); + mu_run_test(profile_left_matmul, tests_run); +#endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h new file mode 100644 index 0000000..4ab655b --- /dev/null +++ b/tests/profiling/profile_left_matmul.h @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *profile_left_matmul() +{ + /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ + int n = 100; + expr *X = new_variable(n, n, 0, n * n); + CSR_Matrix *A = new_csr_matrix(n, n, n * n); + for (int i = 0; i < n * n; i++) + { + A->x[i] = 1.0; /* dense matrix of all ones */ + } + for (int row = 0; row < n; row++) + { + A->p[row] = row * n; + for (int col = 0; col < n; col++) + { + A->i[row * n + col] = col; + } + } + A->p[n] = n * n; + + expr *AX = new_left_matmul(X, A); + + double *x_vals = (double *) malloc(n * n * sizeof(double)); + for (int i = 0; i < n * n; i++) + { + x_vals[i] = 1.0; + } + + AX->forward(AX, x_vals); + AX->jacobian_init(AX); + AX->eval_jacobian(AX); + + free(x_vals); + free_csr_matrix(A); + free_expr(AX); + return 0; +} From 486c03cd27ff719484d0fdd250295ddd0ef336d1 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 09:59:55 -0800 Subject: [PATCH 02/13] 90 times faster sparsity pattern --- include/subexpr.h | 4 +- include/utils/CSC_Matrix.h | 6 +- include/utils/linalg.h | 23 ++ src/bivariate/left_matmul.c | 17 +- src/bivariate/right_matmul.c | 1 + src/utils/CSC_Matrix.c | 125 +++++------ src/utils/linalg.c | 177 +++++++++++++++ tests/all_tests.c | 17 +- tests/profiling/profile_left_matmul.h | 12 +- tests/utils/test_csr_csc_conversion.h | 172 +++++++++++++++ tests/utils/test_linalg.h | 296 ++++++++++++++++++++++++++ 11 files changed, 775 insertions(+), 75 deletions(-) create mode 100644 include/utils/linalg.h create mode 100644 src/utils/linalg.c create mode 100644 tests/utils/test_csr_csc_conversion.h create mode 100644 tests/utils/test_linalg.h diff --git a/include/subexpr.h b/include/subexpr.h index a663282..ce67048 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -103,13 +103,15 @@ typedef struct elementwise_mult_expr /* Left matrix multiplication: y = A * f(x) where f(x) is an expression. Note that here A does not have global column indices but it is a local matrix. This is an -important distinction compared to linear_op_expr. */ +important distinction compared to ear_op_expr. */ typedef struct left_matmul_expr { expr base; CSR_Matrix *A; CSR_Matrix *AT; CSC_Matrix *CSC_work; + CSR_Matrix *A_temp; // will be replaced by A + int *csr_to_csc_temp; } left_matmul_expr; /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_Matrix.h index ed044dc..de9fcd1 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_Matrix.h @@ -52,10 +52,8 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C) CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork); void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); -/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC - * Precomputes sparsity pattern. No workspace required. - */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B); +CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork); +void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork); /* Fill values of C = A @ B where A is CSR, B is CSC * C must have sparsity pattern already computed diff --git a/include/utils/linalg.h b/include/utils/linalg.h new file mode 100644 index 0000000..07b9711 --- /dev/null +++ b/include/utils/linalg.h @@ -0,0 +1,23 @@ +#ifndef LINALG_H +#define LINALG_H + +/* Forward declarations */ +struct CSR_Matrix; +struct CSC_Matrix; + +/* Compute sparsity pattern for block left multiplication. + * C = [A @ J1; A @ J2; ...; A @ Jp] where A is m x n and J is (n*p) x k. + * J is provided in CSC format and is split into p blocks of n rows each. + * Result C is returned as CSC format with dimensions (m*p) x k. + */ +struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A, + const struct CSC_Matrix *J, + int p); + +/* C = A @ B where A is CSR, B is CSC. Result C is CSR. + * Allocates and precomputes sparsity pattern. No workspace required. + */ +struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A, + const struct CSC_Matrix *B); + +#endif /* LINALG_H */ diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 92dd762..5cd76f3 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -17,6 +17,8 @@ */ #include "bivariate.h" #include "subexpr.h" +#include "utils/Timer.h" +#include "utils/linalg.h" #include #include #include @@ -82,12 +84,19 @@ static void jacobian_init(expr *node) expr *x = node->left; left_matmul_expr *lin_node = (left_matmul_expr *) node; + Timer timer; + /* initialize child's jacobian and precompute sparsity of its transpose */ x->jacobian_init(x); lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); /* precompute sparsity of this node's jacobian */ node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work); + + CSC_Matrix *CSC_temp = block_left_multiply_fill_sparsity( + lin_node->A_temp, lin_node->CSC_work, x->d2); + node->jacobian = csc_to_csr_fill_sparsity(CSC_temp, lin_node->csr_to_csc_temp); + free_csc_matrix(CSC_temp); } static void eval_jacobian(expr *node) @@ -99,7 +108,7 @@ static void eval_jacobian(expr *node) x->eval_jacobian(x); csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork); - /* compute this node's jacobian */ + /* compute this node's jacobian: TODO: should be done without lin_node->A */ csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian); } @@ -164,6 +173,12 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) node->left = u; expr_retain(u); + lin_node->A_temp = new_csr_matrix(A->m, A->n, A->nnz); + memcpy(lin_node->A_temp->p, A->p, (A->m + 1) * sizeof(int)); + memcpy(lin_node->A_temp->i, A->i, A->nnz * sizeof(int)); + memcpy(lin_node->A_temp->x, A->x, A->nnz * sizeof(double)); + lin_node->csr_to_csc_temp = (int *) malloc((A->m * d2) * sizeof(int)); + /* Initialize type-specific fields */ lin_node->A = block_diag_repeat_csr(A, n_blocks); int alloc = MAX(lin_node->A->n, node->n_vars); diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c index 98c08d0..8039318 100644 --- a/src/bivariate/right_matmul.c +++ b/src/bivariate/right_matmul.c @@ -18,6 +18,7 @@ #include "bivariate.h" #include "subexpr.h" #include +#include "utils/linalg.h" /* This file implements the atom 'right_matmul' corresponding to the operation y = f(x) @ A, where A is a given matrix and f(x) is an arbitrary expression. diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 2943599..5fe3e99 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -306,6 +306,69 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) } } +CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork) +{ + CSR_Matrix *C = new_csr_matrix(A->m, A->n, A->nnz); + + int i, j; + int *count = iwork; + memset(count, 0, A->m * sizeof(int)); + + // ------------------------------------------------------------------- + // compute nnz in each row of A + // ------------------------------------------------------------------- + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + count[A->i[j]]++; + } + } + + // ------------------------------------------------------------------ + // compute row pointers + // ------------------------------------------------------------------ + C->p[0] = 0; + for (i = 0; i < A->m; ++i) + { + C->p[i + 1] = C->p[i] + count[i]; + count[i] = C->p[i]; + } + + // ------------------------------------------------------------------ + // fill column indices + // ------------------------------------------------------------------ + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + C->i[count[A->i[j]]] = i; + count[A->i[j]]++; + } + } + + return C; +} + +void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) +{ + int i, j; + int *count = iwork; + memcpy(count, C->p, A->m * sizeof(int)); + + // ------------------------------------------------------------------ + // fill values + // ------------------------------------------------------------------ + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + C->x[count[A->i[j]]] = A->x[j]; + count[A->i[j]]++; + } + } +} + CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) { /* A is m x n, B is m x p, C = B^T A is p x n */ @@ -414,24 +477,6 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, } } -static bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len) -{ - int ai = 0, bi = 0; - while (ai < a_len && bi < b_len) - { - if (a_idx[ai] == b_idx[bi]) return true; - if (a_idx[ai] < b_idx[bi]) - { - ai++; - } - else - { - bi++; - } - } - return false; -} - /* Fill values of C = A @ B where A is CSR, B is CSC. */ void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, CSR_Matrix *C) @@ -454,47 +499,3 @@ void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, } } -/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) - with precomputed sparsity pattern */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) -{ - int m = A->m; - int p = B->n; - - int len_a, len_b; - - int *Cp = (int *) malloc((m + 1) * sizeof(int)); - iVec *Ci = iVec_new(m); - - Cp[0] = 0; - - // -------------------------------------------------------------- - // countz nnz and fill column indices - // -------------------------------------------------------------- - int nnz = 0; - for (int i = 0; i < A->m; i++) - { - len_a = A->p[i + 1] - A->p[i]; - - for (int j = 0; j < B->n; j++) - { - len_b = B->p[j + 1] - B->p[j]; - - if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b)) - { - iVec_append(Ci, j); - nnz++; - } - } - - Cp[i + 1] = nnz; - } - - CSR_Matrix *C = new_csr_matrix(m, p, nnz); - memcpy(C->p, Cp, (m + 1) * sizeof(int)); - memcpy(C->i, Ci->data, nnz * sizeof(int)); - free(Cp); - iVec_free(Ci); - - return C; -} diff --git a/src/utils/linalg.c b/src/utils/linalg.c new file mode 100644 index 0000000..95ffd2e --- /dev/null +++ b/src/utils/linalg.c @@ -0,0 +1,177 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" +#include "utils/iVec.h" +#include +#include +#include +#include +#include + +static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx, + int b_len, int b_offset) +{ + int ai = 0, bi = 0; + while (ai < a_len && bi < b_len) + { + if (a_idx[ai] == b_idx[bi] - b_offset) return true; + if (a_idx[ai] < b_idx[bi] - b_offset) + { + ai++; + } + else + { + bi++; + } + } + return false; +} + +CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, + const CSC_Matrix *J, int p) +{ + /* A is m x n, J is (n*p) x k, C is (m*p) x k */ + int m = A->m; + int n = A->n; + int k = J->n; + assert(J->m == n * p); + int j, jj, block, block_start, block_end, block_jj_start, block_jj_end, + row_offset; + + /* allocate column pointers and an estimate of row indices */ + int *Cp = (int *) malloc((k + 1) * sizeof(int)); + iVec *Ci = iVec_new(k * m); + Cp[0] = 0; + + /* for each column of j */ + for (j = 0; j < J->n; j++) + { + /* if empty we continue */ + if (J->p[j] == J->p[j + 1]) + { + Cp[j + 1] = Cp[j]; + continue; + } + + /* process each of p blocks of rows in this column of J */ + jj = J->p[j]; + for (block = 0; block < p; block++) + { + + // ----------------------------------------------------------------- + // find start and end indices of rows of J in this block + // ----------------------------------------------------------------- + block_start = block * n; + block_end = block_start + n; + while (jj < J->p[j + 1] && J->i[jj] < block_start) + { + jj++; + } + + block_jj_start = jj; + + while (jj < J->p[j + 1] && J->i[jj] < block_end) + { + jj++; + } + + block_jj_end = jj; + int nnz_in_block = block_jj_end - block_jj_start; + if (nnz_in_block == 0) + { + continue; + } + + + // ----------------------------------------------------------------- + // check which rows of A overlap with the column indices of J in this + // block + // ----------------------------------------------------------------- + row_offset = block * m; + for (int i = 0; i < A->m; i++) + { + int a_len = A->p[i + 1] - A->p[i]; + if (has_overlap(A->i + A->p[i], a_len, J->i + block_jj_start, nnz_in_block, + block_start)) + { + iVec_append(Ci, row_offset + i); + } + } + } + Cp[j + 1] = Ci->len; + } + + /* Allocate result matrix C in CSC format */ + CSC_Matrix *C = new_csc_matrix(m * p, k, Ci->len); + + /* Copy column pointers and row indices */ + memcpy(C->p, Cp, (k + 1) * sizeof(int)); + memcpy(C->i, Ci->data, Ci->len * sizeof(int)); + + /* Clean up workspace */ + free(Cp); + iVec_free(Ci); + + return C; +} + +/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) + with precomputed sparsity pattern */ +CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) +{ + int m = A->m; + int p = B->n; + + int len_a, len_b; + + int *Cp = (int *) malloc((m + 1) * sizeof(int)); + iVec *Ci = iVec_new(m); + + Cp[0] = 0; + + // -------------------------------------------------------------- + // count nnz and fill column indices + // -------------------------------------------------------------- + int nnz = 0; + for (int i = 0; i < A->m; i++) + { + len_a = A->p[i + 1] - A->p[i]; + + for (int j = 0; j < B->n; j++) + { + len_b = B->p[j + 1] - B->p[j]; + + if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b, 0)) + { + iVec_append(Ci, j); + nnz++; + } + } + + Cp[i + 1] = nnz; + } + + CSR_Matrix *C = new_csr_matrix(m, p, nnz); + memcpy(C->p, Cp, (m + 1) * sizeof(int)); + memcpy(C->i, Ci->data, nnz * sizeof(int)); + free(Cp); + iVec_free(Ci); + + return C; +} diff --git a/tests/all_tests.c b/tests/all_tests.c index 1887639..5246140 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -1,5 +1,3 @@ -all_tests - #include #include "minunit.h" @@ -44,10 +42,11 @@ all_tests #include "jacobian_tests/test_sum.h" #include "jacobian_tests/test_trace.h" #include "jacobian_tests/test_transpose.h" -#include "problem/test_param_prob.h" #include "problem/test_problem.h" #include "utils/test_csc_matrix.h" #include "utils/test_csr_matrix.h" +#include "utils/test_csr_csc_conversion.h" +#include "utils/test_linalg.h" #include "wsum_hess/elementwise/test_entr.h" #include "wsum_hess/elementwise/test_exp.h" #include "wsum_hess/elementwise/test_hyperbolic.h" @@ -249,6 +248,15 @@ int main(void) mu_run_test(test_kron_identity_csr, tests_run); mu_run_test(test_csr_to_csc1, tests_run); mu_run_test(test_csr_to_csc2, tests_run); + mu_run_test(test_csr_to_csc_split, tests_run); + mu_run_test(test_csc_to_csr_sparsity, tests_run); + mu_run_test(test_csc_to_csr_values, tests_run); + mu_run_test(test_csr_csc_csr_roundtrip, tests_run); + mu_run_test(test_block_left_multiply_single_block, tests_run); + mu_run_test(test_block_left_multiply_two_blocks, tests_run); + mu_run_test(test_block_left_multiply_zero_column, tests_run); + mu_run_test(test_csr_csc_matmul_alloc_basic, tests_run); + mu_run_test(test_csr_csc_matmul_alloc_sparse, tests_run); mu_run_test(test_csr_vecmat_values_sparse, tests_run); mu_run_test(test_sum_all_rows_csr, tests_run); mu_run_test(test_sum_block_of_rows_csr, tests_run); @@ -267,9 +275,6 @@ int main(void) mu_run_test(test_problem_jacobian_multi, tests_run); mu_run_test(test_problem_constraint_forward, tests_run); mu_run_test(test_problem_hessian, tests_run); - mu_run_test(test_param_scalar_mult_problem, tests_run); - mu_run_test(test_param_vector_mult_problem, tests_run); - mu_run_test(test_param_left_matmul_problem, tests_run); #endif /* PROFILE_ONLY */ #ifdef PROFILE_ONLY diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index 4ab655b..f8da868 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -9,6 +9,7 @@ #include "expr.h" #include "minunit.h" #include "test_helpers.h" +#include "utils/Timer.h" const char *profile_left_matmul() { @@ -38,9 +39,18 @@ const char *profile_left_matmul() x_vals[i] = 1.0; } - AX->forward(AX, x_vals); + // should benchmark forward later + //AX->forward(AX, x_vals); + + Timer timer; + clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->jacobian_init(AX); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul jacobian init time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); + clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->eval_jacobian(AX); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul jacobian eval time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); free(x_vals); free_csr_matrix(A); diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h new file mode 100644 index 0000000..c9e24f7 --- /dev/null +++ b/tests/utils/test_csr_csc_conversion.h @@ -0,0 +1,172 @@ +#include +#include +#include +#include + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" + +/* Test CSR to CSC conversion with fill_sparsity and fill_values */ +const char *test_csr_to_csc_split() +{ + /* Create a 4x5 CSR matrix A: + * [1.0 0.0 0.0 0.0 1.0] + * [0.0 0.0 3.0 0.0 0.0] + * [0.0 2.0 0.0 0.0 0.0] + * [0.0 0.0 0.0 4.0 0.0] + */ + CSR_Matrix *A = new_csr_matrix(4, 5, 5); + double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; + int Ai[5] = {0, 4, 2, 1, 3}; + int Ap[5] = {0, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 5 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->n * sizeof(int)); + + /* First, fill sparsity pattern */ + CSC_Matrix *C = csr_to_csc_fill_sparsity(A, iwork); + + /* Check sparsity pattern */ + int Cp_correct[6] = {0, 1, 2, 3, 4, 5}; + int Ci_correct[5] = {0, 2, 1, 3, 0}; + + mu_assert("C col pointers incorrect", cmp_int_array(C->p, Cp_correct, 6)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, Ci_correct, 5)); + + /* Now fill values */ + csr_to_csc_fill_values(A, C, iwork); + + /* Check values */ + double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 1.0}; + + mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); + + free(iwork); + free_csr_matrix(A); + free_csc_matrix(C); + + return 0; +} + +/* Test CSC to CSR conversion with fill_sparsity */ +const char *test_csc_to_csr_sparsity() +{ + /* Create a 4x5 CSC matrix A: + * [1.0 0.0 0.0 0.0 2.0] + * [0.0 0.0 3.0 0.0 0.0] + * [0.0 4.0 0.0 0.0 0.0] + * [0.0 0.0 0.0 5.0 0.0] + */ + CSC_Matrix *A = new_csc_matrix(4, 5, 5); + double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; + int Ai[5] = {0, 2, 1, 3, 0}; + int Ap[6] = {0, 1, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 6 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->m * sizeof(int)); + + /* Fill sparsity pattern */ + CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork); + + /* Expected CSR format: + * Row 0: [1.0 at col 0, 2.0 at col 4] + * Row 1: [3.0 at col 2] + * Row 2: [4.0 at col 1] + * Row 3: [5.0 at col 3] + */ + int Cp_correct[5] = {0, 2, 3, 4, 5}; + int Ci_correct[5] = {0, 4, 2, 1, 3}; + + mu_assert("C row pointers incorrect", cmp_int_array(C->p, Cp_correct, 5)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, Ci_correct, 5)); + mu_assert("C dimensions incorrect", C->m == 4 && C->n == 5); + mu_assert("C nnz incorrect", C->nnz == 5); + + free(iwork); + free_csc_matrix(A); + free_csr_matrix(C); + + return 0; +} + +/* Test CSC to CSR conversion with fill_values */ +const char *test_csc_to_csr_values() +{ + /* Create a 4x5 CSC matrix A */ + CSC_Matrix *A = new_csc_matrix(4, 5, 5); + double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; + int Ai[5] = {0, 2, 1, 3, 0}; + int Ap[6] = {0, 1, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 6 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->m * sizeof(int)); + + /* Fill sparsity pattern */ + CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork); + + /* Fill values */ + csc_to_csr_fill_values(A, C, iwork); + + /* Check values */ + double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + + mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); + + free(iwork); + free_csc_matrix(A); + free_csr_matrix(C); + + return 0; +} + +/* Test round-trip conversion: CSR -> CSC -> CSR */ +const char *test_csr_csc_csr_roundtrip() +{ + /* Create a 3x4 CSR matrix A: + * [1.0 2.0 0.0 3.0] + * [0.0 4.0 5.0 0.0] + * [6.0 0.0 7.0 8.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 4, 8); + double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3}; + int Ap[4] = {0, 3, 5, 8}; + memcpy(A->x, Ax, 8 * sizeof(double)); + memcpy(A->i, Ai, 8 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* Convert CSR to CSC */ + int *iwork_csc = (int *) malloc(A->n * sizeof(int)); + CSC_Matrix *B = csr_to_csc_fill_sparsity(A, iwork_csc); + csr_to_csc_fill_values(A, B, iwork_csc); + + /* Convert CSC back to CSR */ + int *iwork_csr = (int *) malloc(B->m * sizeof(int)); + CSR_Matrix *C = csc_to_csr_fill_sparsity(B, iwork_csr); + csc_to_csr_fill_values(B, C, iwork_csr); + + /* C should match A */ + mu_assert("Round-trip: vals incorrect", cmp_double_array(C->x, Ax, 8)); + mu_assert("Round-trip: col indices incorrect", cmp_int_array(C->i, Ai, 8)); + mu_assert("Round-trip: row pointers incorrect", cmp_int_array(C->p, Ap, 4)); + + free(iwork_csc); + free(iwork_csr); + free_csr_matrix(A); + free_csc_matrix(B); + free_csr_matrix(C); + + return 0; +} + diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h new file mode 100644 index 0000000..4011455 --- /dev/null +++ b/tests/utils/test_linalg.h @@ -0,0 +1,296 @@ +#include +#include +#include +#include + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" +#include "utils/linalg.h" + +/* Test block_left_multiply_fill_sparsity with simple case: single block */ +const char *test_block_left_multiply_single_block() +{ + /* A is 2x3 CSR: + * [1.0 0.0 0.0] + * [0.0 1.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 3); + double Ax[3] = {1.0, 1.0, 1.0}; + int Ai[3] = {0, 1, 2}; + int Ap[4] = {0, 1, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* J is 3x2 CSC (single block, so p=1): + * [1.0 0.0] + * [1.0 0.0] + * [0.0 1.0] + */ + CSC_Matrix *J = new_csc_matrix(3, 2, 3); + double Jx[3] = {1.0, 1.0, 1.0}; + int Ji[3] = {0, 1, 2}; + int Jp[3] = {0, 2, 3}; + memcpy(J->x, Jx, 3 * sizeof(double)); + memcpy(J->i, Ji, 3 * sizeof(int)); + memcpy(J->p, Jp, 3 * sizeof(int)); + + /* Compute C = A @ J1 (p=1 means just one block) */ + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + + /* Expected C is 2x2: + * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has row 0) + * C[1,0] = A[1,:] @ J[:,0] = 1.0*1.0 + 1.0*0.0 = 1.0 (row 1 has columns 1,2, J col 0 has row 1) + * C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J col 1 has row 2) + * C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns 1,2, J col 1 has row 2) + */ + mu_assert("C->m should be 2", C->m == 2); + mu_assert("C->n should be 2", C->n == 2); + mu_assert("C->nnz should be 3", C->nnz == 3); /* nnz at (0,0), (1,0), (1,1) */ + + /* Check column pointers */ + mu_assert("C->p[0] should be 0", C->p[0] == 0); + mu_assert("C->p[1] should be 2", C->p[1] == 2); /* column 0: rows 0,1 */ + mu_assert("C->p[2] should be 3", C->p[2] == 3); /* column 1: row 1 */ + + /* Check row indices */ + mu_assert("C->i[0] should be 0", C->i[0] == 0); /* (0,0) */ + mu_assert("C->i[1] should be 1", C->i[1] == 1); /* (1,0) */ + mu_assert("C->i[2] should be 1", C->i[2] == 1); /* (1,1) */ + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test block_left_multiply_fill_sparsity with two blocks */ +const char *test_block_left_multiply_two_blocks() +{ + /* A is 2x2 CSR: + * [1.0 0.0] + * [0.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 2, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 1}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* J is 4x3 CSC (two blocks of 2 rows each): + * Block 1 rows [0,1]: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * Block 2 rows [2,3]: + * [0.0 1.0 0.0] + * [0.0 0.0 1.0] + * So J is: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 1.0 0.0] + * [0.0 0.0 1.0] + */ + CSC_Matrix *J = new_csc_matrix(4, 3, 3); + double Jx[3] = {1.0, 1.0, 1.0}; + int Ji[3] = {0, 2, 3}; + int Jp[4] = {0, 1, 2, 3}; + memcpy(J->x, Jx, 3 * sizeof(double)); + memcpy(J->i, Ji, 3 * sizeof(int)); + memcpy(J->p, Jp, 4 * sizeof(int)); + + /* Compute C = [A @ J1; A @ J2] where: + * J1 = [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + * J2 = [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + * + * C = [A @ J1; A @ J2] is 4x3: + * A @ J1 = [[1, 0, 0], [0, 0, 0]] (row 0 of A matches col 0 of J1) + * A @ J2 = [[0, 0, 0], [0, 1, 1]] (row 1 of A matches cols 1,2 of J2) + * So C is: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 1.0 1.0] + */ + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2); + + mu_assert("C->m should be 4", C->m == 4); + mu_assert("C->n should be 3", C->n == 3); + mu_assert("C->nnz should be 3", C->nnz == 3); + + /* Check column pointers */ + mu_assert("C->p[0] should be 0", C->p[0] == 0); + mu_assert("C->p[1] should be 1", C->p[1] == 1); + mu_assert("C->p[2] should be 2", C->p[2] == 2); + mu_assert("C->p[3] should be 3", C->p[3] == 3); + + /* Check row indices */ + mu_assert("C->i[0] should be 0", C->i[0] == 0); + mu_assert("C->i[1] should be 2", C->i[1] == 2); + mu_assert("C->i[2] should be 3", C->i[2] == 3); + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test block_left_multiply_fill_sparsity with all zero column in J */ +const char *test_block_left_multiply_zero_column() +{ + /* A is 2x2 CSR (identity) */ + CSR_Matrix *A = new_csr_matrix(2, 2, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 1}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* J is 2x2 with an empty column: + * [1.0 0.0] + * [0.0 0.0] + */ + CSC_Matrix *J = new_csc_matrix(2, 2, 1); + double Jx[1] = {1.0}; + int Ji[1] = {0}; + int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ + memcpy(J->x, Jx, 1 * sizeof(double)); + memcpy(J->i, Ji, 1 * sizeof(int)); + memcpy(J->p, Jp, 3 * sizeof(int)); + + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + + mu_assert("C->m should be 2", C->m == 2); + mu_assert("C->n should be 2", C->n == 2); + mu_assert("C->nnz should be 1", C->nnz == 1); + + mu_assert("C->p[0] should be 0", C->p[0] == 0); + mu_assert("C->p[1] should be 1", C->p[1] == 1); + mu_assert("C->p[2] should be 1", C->p[2] == 1); + + mu_assert("C->i[0] should be 0", C->i[0] == 0); + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */ +const char *test_csr_csc_matmul_alloc_basic() +{ + /* A is 3x2 CSR: + * [1.0 0.0] + * [0.0 1.0] + * [1.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 2, 4); + double Ax[4] = {1.0, 1.0, 1.0, 1.0}; + int Ai[4] = {0, 1, 0, 1}; + int Ap[4] = {0, 1, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* B is 2x3 CSC: + * [1.0 0.0 1.0] + * [0.0 1.0 1.0] + */ + CSC_Matrix *B = new_csc_matrix(2, 3, 4); + double Bx[4] = {1.0, 1.0, 1.0, 1.0}; + int Bi[4] = {0, 1, 0, 1}; + int Bp[4] = {0, 1, 2, 4}; + memcpy(B->x, Bx, 4 * sizeof(double)); + memcpy(B->i, Bi, 4 * sizeof(int)); + memcpy(B->p, Bp, 4 * sizeof(int)); + + /* C = A @ B is 3x3: + * C = [[1, 0, 1], + * [0, 1, 1], + * [1, 1, 2]] + */ + CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + + mu_assert("C->m should be 3", C->m == 3); + mu_assert("C->n should be 3", C->n == 3); + mu_assert("C->nnz should be 7", C->nnz == 7); + + /* Check row pointers */ + mu_assert("C->p[0] should be 0", C->p[0] == 0); + mu_assert("C->p[1] should be 2", C->p[1] == 2); + mu_assert("C->p[2] should be 4", C->p[2] == 4); + mu_assert("C->p[3] should be 7", C->p[3] == 7); + + /* Check column indices for row 0: columns 0, 2 */ + mu_assert("C->i[0] should be 0", C->i[0] == 0); + mu_assert("C->i[1] should be 2", C->i[1] == 2); + + /* Check column indices for row 1: columns 1, 2 */ + mu_assert("C->i[2] should be 1", C->i[2] == 1); + mu_assert("C->i[3] should be 2", C->i[3] == 2); + + /* Check column indices for row 2: columns 0, 1, 2 */ + mu_assert("C->i[4] should be 0", C->i[4] == 0); + mu_assert("C->i[5] should be 1", C->i[5] == 1); + mu_assert("C->i[6] should be 2", C->i[6] == 2); + + free_csr_matrix(C); + free_csr_matrix(A); + free_csc_matrix(B); + return NULL; +} + +/* Test csr_csc_matmul_alloc with sparse result */ +const char *test_csr_csc_matmul_alloc_sparse() +{ + /* A is 2x3 CSR: + * [1.0 0.0 0.0] + * [0.0 0.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 2}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* B is 3x2 CSC: + * [1.0 0.0] + * [0.0 0.0] + * [0.0 1.0] + */ + CSC_Matrix *B = new_csc_matrix(3, 2, 2); + double Bx[2] = {1.0, 1.0}; + int Bi[2] = {0, 2}; + int Bp[3] = {0, 1, 2}; + memcpy(B->x, Bx, 2 * sizeof(double)); + memcpy(B->i, Bi, 2 * sizeof(int)); + memcpy(B->p, Bp, 3 * sizeof(int)); + + /* C = A @ B is 2x2: + * C = [[1, 0], + * [0, 1]] + */ + CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + + mu_assert("C->m should be 2", C->m == 2); + mu_assert("C->n should be 2", C->n == 2); + mu_assert("C->nnz should be 2", C->nnz == 2); + + mu_assert("C->p[0] should be 0", C->p[0] == 0); + mu_assert("C->p[1] should be 1", C->p[1] == 1); + mu_assert("C->p[2] should be 2", C->p[2] == 2); + + mu_assert("C->i[0] should be 0", C->i[0] == 0); + mu_assert("C->i[1] should be 1", C->i[1] == 1); + + free_csr_matrix(C); + free_csr_matrix(A); + free_csc_matrix(B); + return NULL; +} From f54382c41ea6ae96d7d262b7092668a6878a191a Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:07:01 -0800 Subject: [PATCH 03/13] fill values without forming A kron --- include/subexpr.h | 8 +- include/utils/CSC_Matrix.h | 6 -- include/utils/CSR_Matrix.h | 1 + include/utils/linalg.h | 14 +++ src/bivariate/left_matmul.c | 83 +++++++------- src/utils/CSC_Matrix.c | 52 +-------- src/utils/CSR_Matrix.c | 9 ++ src/utils/linalg.c | 150 +++++++++++++++++++++++++- tests/profiling/profile_left_matmul.h | 2 +- tests/utils/test_linalg.h | 91 +++++----------- 10 files changed, 245 insertions(+), 171 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index ce67048..e37c161 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -108,10 +108,10 @@ typedef struct left_matmul_expr { expr base; CSR_Matrix *A; - CSR_Matrix *AT; - CSC_Matrix *CSC_work; - CSR_Matrix *A_temp; // will be replaced by A - int *csr_to_csc_temp; + CSR_Matrix *A_kron_T; + CSC_Matrix *Jchild_CSC; + CSC_Matrix *J_CSC; + int *iwork2; /* workspace for csc to csr */ } left_matmul_expr; /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_Matrix.h index de9fcd1..c444dcc 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_Matrix.h @@ -55,10 +55,4 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork); void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork); -/* Fill values of C = A @ B where A is CSR, B is CSC - * C must have sparsity pattern already computed - */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C); - #endif /* CSC_MATRIX_H */ diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_Matrix.h index 2e98abf..8c3f523 100644 --- a/include/utils/CSR_Matrix.h +++ b/include/utils/CSR_Matrix.h @@ -27,6 +27,7 @@ typedef struct CSR_Matrix /* Allocate a new CSR matrix with given dimensions and nnz */ CSR_Matrix *new_csr_matrix(int m, int n, int nnz); +CSR_Matrix *new_csr(const CSR_Matrix *A); /* Free a CSR matrix */ void free_csr_matrix(CSR_Matrix *matrix); diff --git a/include/utils/linalg.h b/include/utils/linalg.h index 07b9711..6baf1eb 100644 --- a/include/utils/linalg.h +++ b/include/utils/linalg.h @@ -14,6 +14,20 @@ struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A, const struct CSC_Matrix *J, int p); +/* Fill values for block left multiplication. + * C must have sparsity pattern already computed for [A @ J1; ...; A @ Jp]. + */ +void block_left_multiply_fill_values(const struct CSR_Matrix *A, + const struct CSC_Matrix *J, + struct CSC_Matrix *C); + +/* Fill values of C = A @ B where A is CSR, B is CSC. + * C must have sparsity pattern already computed. + */ +void csr_csc_matmul_fill_values(const struct CSR_Matrix *A, + const struct CSC_Matrix *B, + struct CSR_Matrix *C); + /* C = A @ B where A is CSR, B is CSC. Result C is CSR. * Allocates and precomputes sparsity pattern. No workspace required. */ diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 5cd76f3..21cb444 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -33,7 +33,9 @@ * To compute the forward pass: vec(y) = A_kron @ vec(f(x)), where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p), or more specificely, a block-diagonal matrix with p blocks of A along the - diagonal. + diagonal. In the refactored implementation we don't form A_kron explicitly, + only conceptually. This led to a 100x speedup in the initialization of the + Jacobian sparsity pattern. * To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the Jacobian of f(x) of size (n*p) x n_vars. @@ -44,7 +46,8 @@ Working in terms of A_kron unifies the implementation of f(x) being vector-valued or matrix-valued. - + I (dance858) think we can get additional big speedups when A is dense by + introducing a dense matrix class. */ #include "utils/utils.h" @@ -56,7 +59,7 @@ static void forward(expr *node, const double *u) /* child's forward pass */ node->left->forward(node->left, u); - /* y = A_kron @ vec(f(x)) */ + /* y = A_kron @ vec(f(x)). TODO: this forward pass is wrong */ csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value); } @@ -69,14 +72,13 @@ static void free_type_data(expr *node) { left_matmul_expr *lin_node = (left_matmul_expr *) node; free_csr_matrix(lin_node->A); - free_csr_matrix(lin_node->AT); - if (lin_node->CSC_work) - { - free_csc_matrix(lin_node->CSC_work); - } + free_csr_matrix(lin_node->A_kron_T); + free_csc_matrix(lin_node->Jchild_CSC); + free_csc_matrix(lin_node->J_CSC); lin_node->A = NULL; - lin_node->AT = NULL; - lin_node->CSC_work = NULL; + lin_node->A_kron_T = NULL; + lin_node->Jchild_CSC = NULL; + lin_node->J_CSC = NULL; } static void jacobian_init(expr *node) @@ -84,32 +86,31 @@ static void jacobian_init(expr *node) expr *x = node->left; left_matmul_expr *lin_node = (left_matmul_expr *) node; - Timer timer; - - /* initialize child's jacobian and precompute sparsity of its transpose */ + /* initialize child's jacobian and precompute sparsity of its CSC */ x->jacobian_init(x); - lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); - /* precompute sparsity of this node's jacobian */ - node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work); - - CSC_Matrix *CSC_temp = block_left_multiply_fill_sparsity( - lin_node->A_temp, lin_node->CSC_work, x->d2); - node->jacobian = csc_to_csr_fill_sparsity(CSC_temp, lin_node->csr_to_csc_temp); - free_csc_matrix(CSC_temp); + /* precompute sparsity of this node's jacobian in CSC and CSR */ + lin_node->J_CSC = + block_left_multiply_fill_sparsity(lin_node->A, lin_node->Jchild_CSC, x->d2); + node->jacobian = csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->iwork2); } static void eval_jacobian(expr *node) { expr *x = node->left; - left_matmul_expr *lin_node = (left_matmul_expr *) node; + left_matmul_expr *lnode = (left_matmul_expr *) node; + + CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC; + CSC_Matrix *J_CSC = lnode->J_CSC; /* evaluate child's jacobian and convert to CSC */ x->eval_jacobian(x); - csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork); + csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork); - /* compute this node's jacobian: TODO: should be done without lin_node->A */ - csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian); + /* compute this node's jacobian: */ + block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC); + csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->iwork2); } static void wsum_hess_init(expr *node) @@ -124,15 +125,15 @@ static void wsum_hess_init(expr *node) memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); /* work for computing A^T w*/ - int A_n = ((left_matmul_expr *) node)->A->n; - node->dwork = (double *) malloc(A_n * sizeof(double)); + int dim = ((left_matmul_expr *) node)->A_kron_T->m; + node->dwork = (double *) malloc(dim * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { /* compute A^T w*/ left_matmul_expr *lin_node = (left_matmul_expr *) node; - csr_matvec_wo_offset(lin_node->AT, w, node->dwork); + csr_matvec_wo_offset(lin_node->A_kron_T, w, node->dwork); node->left->eval_wsum_hess(node->left, node->dwork); memcpy(node->wsum_hess->x, node->left->wsum_hess->x, @@ -141,10 +142,10 @@ static void eval_wsum_hess(expr *node, const double *w) expr *new_left_matmul(expr *u, const CSR_Matrix *A) { - /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do - A @ u where u is (n, ) which in C is actually (1, n). In that case the result - of A @ u is (m, ), which is (1, m) according to broadcasting rules. We - therefore check if this is the case. */ + /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users + to do A @ u where u is (n, ) which in C is actually (1, n). In that case + the result of A @ u is (m, ), which is (1, m) according to broadcasting + rules. We therefore check if this is the case. */ int d1, d2, n_blocks; if (u->d1 == A->n) { @@ -173,18 +174,16 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) node->left = u; expr_retain(u); - lin_node->A_temp = new_csr_matrix(A->m, A->n, A->nnz); - memcpy(lin_node->A_temp->p, A->p, (A->m + 1) * sizeof(int)); - memcpy(lin_node->A_temp->i, A->i, A->nnz * sizeof(int)); - memcpy(lin_node->A_temp->x, A->x, A->nnz * sizeof(double)); - lin_node->csr_to_csc_temp = (int *) malloc((A->m * d2) * sizeof(int)); + /* store A */ + lin_node->A = new_csr(A); + lin_node->iwork2 = (int *) malloc((A->m * d2) * sizeof(int)); - /* Initialize type-specific fields */ - lin_node->A = block_diag_repeat_csr(A, n_blocks); - int alloc = MAX(lin_node->A->n, node->n_vars); + /* create kron(I, A)^T */ + int alloc = MAX(A->n * n_blocks, node->n_vars); node->iwork = (int *) malloc(alloc * sizeof(int)); - lin_node->AT = transpose(lin_node->A, node->iwork); - lin_node->CSC_work = NULL; + CSR_Matrix *AT_temp = transpose(lin_node->A, node->iwork); + lin_node->A_kron_T = block_diag_repeat_csr(AT_temp, n_blocks); + free_csr_matrix(AT_temp); return node; } diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 5fe3e99..1d9cd25 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -139,35 +139,6 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz, return sum; } -/* Unweighted sparse dot product of two sorted index arrays */ -static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, - const double *b_x, const int *b_i, int b_nnz) -{ - int ii = 0; - int jj = 0; - double sum = 0.0; - - while (ii < a_nnz && jj < b_nnz) - { - if (a_i[ii] == b_i[jj]) - { - sum += a_x[ii] * b_x[jj]; - ii++; - jj++; - } - else if (a_i[ii] < b_i[jj]) - { - ii++; - } - else - { - jj++; - } - } - - return sum; -} - void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) { int j, ii, jj; @@ -315,7 +286,7 @@ CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork) memset(count, 0, A->m * sizeof(int)); // ------------------------------------------------------------------- - // compute nnz in each row of A + // compute nnz in each row of A, store in count // ------------------------------------------------------------------- for (i = 0; i < A->n; ++i) { @@ -477,25 +448,4 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, } } -/* Fill values of C = A @ B where A is CSR, B is CSC. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C) -{ - for (int i = 0; i < A->m; i++) - { - for (int jj = C->p[i]; jj < C->p[i + 1]; jj++) - { - int j = C->i[jj]; - - int a_nnz = A->p[i + 1] - A->p[i]; - int b_nnz = B->p[j + 1] - B->p[j]; - - /* Compute dot product of row i of A and column j of B */ - double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz, - B->x + B->p[j], B->i + B->p[j], b_nnz); - - C->x[jj] = sum; - } - } -} diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_Matrix.c index 8b0635b..fe6688a 100644 --- a/src/utils/CSR_Matrix.c +++ b/src/utils/CSR_Matrix.c @@ -36,6 +36,15 @@ CSR_Matrix *new_csr_matrix(int m, int n, int nnz) return matrix; } +CSR_Matrix *new_csr(const CSR_Matrix *A) +{ + CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); + memcpy(copy->i, A->i, A->nnz * sizeof(int)); + memcpy(copy->x, A->x, A->nnz * sizeof(double)); + return copy; +} + void free_csr_matrix(CSR_Matrix *matrix) { if (matrix) diff --git a/src/utils/linalg.c b/src/utils/linalg.c index 95ffd2e..d528ea5 100644 --- a/src/utils/linalg.c +++ b/src/utils/linalg.c @@ -43,6 +43,66 @@ static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx, return false; } +/* Unweighted sparse dot product of two sorted index arrays */ +static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, + const double *b_x, const int *b_i, int b_nnz, + int b_offset) +{ + int ii = 0; + int jj = 0; + double sum = 0.0; + + while (ii < a_nnz && jj < b_nnz) + { + if (a_i[ii] == b_i[jj] - b_offset) + { + sum += a_x[ii] * b_x[jj]; + ii++; + jj++; + } + else if (a_i[ii] < b_i[jj] - b_offset) + { + ii++; + } + else + { + jj++; + } + } + + return sum; +} + +static inline double sparse_dot_offset(const double *a_x, const int *a_idx, + int a_nnz, const double *b_x, + const int *b_idx, int b_nnz, int b_offset) +{ + int ii = 0, jj = 0; + double sum = 0.0; + + while (ii < a_nnz && jj < b_nnz) + { + int b_col = b_idx[jj] - b_offset; + + if (a_idx[ii] == b_col) + { + sum += a_x[ii] * b_x[jj]; + ii++; + jj++; + } + else if (a_idx[ii] < b_col) + { + ii++; + } + else + { + jj++; + } + } + + return sum; +} + CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, const CSC_Matrix *J, int p) { @@ -50,7 +110,6 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, int m = A->m; int n = A->n; int k = J->n; - assert(J->m == n * p); int j, jj, block, block_start, block_end, block_jj_start, block_jj_end, row_offset; @@ -97,7 +156,6 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, { continue; } - // ----------------------------------------------------------------- // check which rows of A overlap with the column indices of J in this @@ -107,8 +165,8 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, for (int i = 0; i < A->m; i++) { int a_len = A->p[i + 1] - A->p[i]; - if (has_overlap(A->i + A->p[i], a_len, J->i + block_jj_start, nnz_in_block, - block_start)) + if (has_overlap(A->i + A->p[i], a_len, J->i + block_jj_start, + nnz_in_block, block_start)) { iVec_append(Ci, row_offset + i); } @@ -131,6 +189,90 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, return C; } +void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, + CSC_Matrix *C) +{ + /* A is m x n, J is (n*p) x k, C is (m*p) x k */ + int m = A->m; + int n = A->n; + int k = J->n; + + int i, j, row_a, block, block_start, block_end, start, end; + + /* to get rid of unitialized warnings */ + block = 0; + block_start = 0; + block_end = 0; + start = 0; + end = 0; + + /* for each column of J (and C) */ + for (j = 0; j < k; j++) + { + int previous_block = -1; + + for (i = C->p[j]; i < C->p[j + 1]; i++) + { + /* choose row of A and block of column of J */ + row_a = C->i[i] % m; + block = C->i[i] / m; + + // ------------------------------------------------------------------------- + // find the part of the column of J in the current block + // ------------------------------------------------------------------------- + if (block != previous_block) + { + previous_block = block; + block_start = block * n; + block_end = block_start + n; + start = J->p[j]; + end = J->p[j + 1]; + + while (start < J->p[j + 1] && J->i[start] < block_start) + { + start++; + } + + while (end > start && J->i[end - 1] >= block_end) + { + end--; + } + } + + // ------------------------------------------------------------------------------ + // compute value as sparse dot product of row of A and column of J in + // this block + // ------------------------------------------------------------------------------ + int a_len = A->p[row_a + 1] - A->p[row_a]; + C->x[i] = + sparse_dot(A->x + A->p[row_a], A->i + A->p[row_a], a_len, + J->x + start, J->i + start, end - start, block_start); + } + } +} + +/* Fill values of C = A @ B where A is CSR, B is CSC. */ +void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, + CSR_Matrix *C) +{ + for (int i = 0; i < A->m; i++) + { + for (int jj = C->p[i]; jj < C->p[i + 1]; jj++) + { + int j = C->i[jj]; + + int a_nnz = A->p[i + 1] - A->p[i]; + int b_nnz = B->p[j + 1] - B->p[j]; + + /* Compute dot product of row i of A and column j of B */ + double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz, + B->x + B->p[j], B->i + B->p[j], b_nnz, 0); + + C->x[jj] = sum; + } + } +} + /* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) with precomputed sparsity pattern */ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index f8da868..e64ec5a 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -14,7 +14,7 @@ const char *profile_left_matmul() { /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ - int n = 100; + int n = 50; expr *X = new_variable(n, n, 0, n * n); CSR_Matrix *A = new_csr_matrix(n, n, n * n); for (int i = 0; i < n * n; i++) diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h index 4011455..15f94ee 100644 --- a/tests/utils/test_linalg.h +++ b/tests/utils/test_linalg.h @@ -46,19 +46,12 @@ const char *test_block_left_multiply_single_block() * C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J col 1 has row 2) * C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns 1,2, J col 1 has row 2) */ - mu_assert("C->m should be 2", C->m == 2); - mu_assert("C->n should be 2", C->n == 2); - mu_assert("C->nnz should be 3", C->nnz == 3); /* nnz at (0,0), (1,0), (1,1) */ + int expected_p[3] = {0, 2, 3}; + int expected_i[3] = {0, 1, 1}; - /* Check column pointers */ - mu_assert("C->p[0] should be 0", C->p[0] == 0); - mu_assert("C->p[1] should be 2", C->p[1] == 2); /* column 0: rows 0,1 */ - mu_assert("C->p[2] should be 3", C->p[2] == 3); /* column 1: row 1 */ - - /* Check row indices */ - mu_assert("C->i[0] should be 0", C->i[0] == 0); /* (0,0) */ - mu_assert("C->i[1] should be 1", C->i[1] == 1); /* (1,0) */ - mu_assert("C->i[2] should be 1", C->i[2] == 1); /* (1,1) */ + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 3); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p, 3)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i, 3)); free_csc_matrix(C); free_csr_matrix(A); @@ -116,21 +109,16 @@ const char *test_block_left_multiply_two_blocks() * [0.0 1.0 1.0] */ CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2); + block_left_multiply_fill_values(A, J, C); - mu_assert("C->m should be 4", C->m == 4); - mu_assert("C->n should be 3", C->n == 3); - mu_assert("C->nnz should be 3", C->nnz == 3); - - /* Check column pointers */ - mu_assert("C->p[0] should be 0", C->p[0] == 0); - mu_assert("C->p[1] should be 1", C->p[1] == 1); - mu_assert("C->p[2] should be 2", C->p[2] == 2); - mu_assert("C->p[3] should be 3", C->p[3] == 3); + int expected_p2[4] = {0, 1, 2, 3}; + int expected_i2[3] = {0, 2, 3}; + double expected_x2[3] = {1.0, 1.0, 1.0}; - /* Check row indices */ - mu_assert("C->i[0] should be 0", C->i[0] == 0); - mu_assert("C->i[1] should be 2", C->i[1] == 2); - mu_assert("C->i[2] should be 3", C->i[2] == 3); + mu_assert("C dims incorrect", C->m == 4 && C->n == 3 && C->nnz == 3); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p2, 4)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i2, 3)); + mu_assert("C values incorrect", cmp_double_array(C->x, expected_x2, 3)); free_csc_matrix(C); free_csr_matrix(A); @@ -164,15 +152,12 @@ const char *test_block_left_multiply_zero_column() CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); - mu_assert("C->m should be 2", C->m == 2); - mu_assert("C->n should be 2", C->n == 2); - mu_assert("C->nnz should be 1", C->nnz == 1); + int expected_p3[3] = {0, 1, 1}; + int expected_i3[1] = {0}; - mu_assert("C->p[0] should be 0", C->p[0] == 0); - mu_assert("C->p[1] should be 1", C->p[1] == 1); - mu_assert("C->p[2] should be 1", C->p[2] == 1); - - mu_assert("C->i[0] should be 0", C->i[0] == 0); + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 1); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p3, 3)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i3, 1)); free_csc_matrix(C); free_csr_matrix(A); @@ -215,28 +200,12 @@ const char *test_csr_csc_matmul_alloc_basic() */ CSR_Matrix *C = csr_csc_matmul_alloc(A, B); - mu_assert("C->m should be 3", C->m == 3); - mu_assert("C->n should be 3", C->n == 3); - mu_assert("C->nnz should be 7", C->nnz == 7); - - /* Check row pointers */ - mu_assert("C->p[0] should be 0", C->p[0] == 0); - mu_assert("C->p[1] should be 2", C->p[1] == 2); - mu_assert("C->p[2] should be 4", C->p[2] == 4); - mu_assert("C->p[3] should be 7", C->p[3] == 7); + int expected_p4[4] = {0, 2, 4, 7}; + int expected_i4[7] = {0, 2, 1, 2, 0, 1, 2}; - /* Check column indices for row 0: columns 0, 2 */ - mu_assert("C->i[0] should be 0", C->i[0] == 0); - mu_assert("C->i[1] should be 2", C->i[1] == 2); - - /* Check column indices for row 1: columns 1, 2 */ - mu_assert("C->i[2] should be 1", C->i[2] == 1); - mu_assert("C->i[3] should be 2", C->i[3] == 2); - - /* Check column indices for row 2: columns 0, 1, 2 */ - mu_assert("C->i[4] should be 0", C->i[4] == 0); - mu_assert("C->i[5] should be 1", C->i[5] == 1); - mu_assert("C->i[6] should be 2", C->i[6] == 2); + mu_assert("C dims incorrect", C->m == 3 && C->n == 3 && C->nnz == 7); + mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p4, 4)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i4, 7)); free_csr_matrix(C); free_csr_matrix(A); @@ -278,16 +247,12 @@ const char *test_csr_csc_matmul_alloc_sparse() */ CSR_Matrix *C = csr_csc_matmul_alloc(A, B); - mu_assert("C->m should be 2", C->m == 2); - mu_assert("C->n should be 2", C->n == 2); - mu_assert("C->nnz should be 2", C->nnz == 2); - - mu_assert("C->p[0] should be 0", C->p[0] == 0); - mu_assert("C->p[1] should be 1", C->p[1] == 1); - mu_assert("C->p[2] should be 2", C->p[2] == 2); + int expected_p5[3] = {0, 1, 2}; + int expected_i5[2] = {0, 1}; - mu_assert("C->i[0] should be 0", C->i[0] == 0); - mu_assert("C->i[1] should be 1", C->i[1] == 1); + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 2); + mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p5, 3)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i5, 2)); free_csr_matrix(C); free_csr_matrix(A); From 37fc13a1700233a2404b30575a2a4642123245ae Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:17:08 -0800 Subject: [PATCH 04/13] update forward pass --- include/utils/linalg.h | 7 ++ src/bivariate/left_matmul.c | 7 +- src/utils/linalg.c | 33 +++++++++ tests/all_tests.c | 4 ++ tests/utils/test_linalg.h | 133 ++++++++++++++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 3 deletions(-) diff --git a/include/utils/linalg.h b/include/utils/linalg.h index 6baf1eb..2b191c2 100644 --- a/include/utils/linalg.h +++ b/include/utils/linalg.h @@ -34,4 +34,11 @@ void csr_csc_matmul_fill_values(const struct CSR_Matrix *A, struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A, const struct CSC_Matrix *B); +/* Compute block-wise matrix-vector products. + * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is divided into p blocks of n elements. + * A is provided in CSR format. x is (n*p)-length vector, y is (m*p)-length output. + */ +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, + double *y, int p); + #endif /* LINALG_H */ diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 21cb444..f8842ce 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -34,7 +34,7 @@ where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p), or more specificely, a block-diagonal matrix with p blocks of A along the diagonal. In the refactored implementation we don't form A_kron explicitly, - only conceptually. This led to a 100x speedup in the initialization of the + only conceptually. This led to a 100x speedup in the initialization of the Jacobian sparsity pattern. * To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the @@ -59,8 +59,9 @@ static void forward(expr *node, const double *u) /* child's forward pass */ node->left->forward(node->left, u); - /* y = A_kron @ vec(f(x)). TODO: this forward pass is wrong */ - csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value); + /* y = A_kron @ vec(f(x)) */ + CSR_Matrix *A = ((left_matmul_expr *) node)->A; + block_left_multiply_vec(A, x->value, node->value, x->d2); } static bool is_affine(const expr *node) diff --git a/src/utils/linalg.c b/src/utils/linalg.c index d528ea5..7e1fb4e 100644 --- a/src/utils/linalg.c +++ b/src/utils/linalg.c @@ -317,3 +317,36 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) return C; } + +/* Compute block-wise matrix-vector products. + * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector. + * x is split into p blocks of n elements each. + */ +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, + double *y, int p) +{ + assert(A != NULL && x != NULL && y != NULL && p > 0); + + /* For each block */ + for (int block = 0; block < p; block++) + { + int block_start = block * A->n; + int y_offset = block * A->m; + + /* For each row of A */ + for (int i = 0; i < A->m; i++) + { + double row_sum = 0.0; + int row_nnz = A->p[i + 1] - A->p[i]; + + /* Compute sparse dot product of A[i,:] with x_block */ + for (int idx = 0; idx < row_nnz; idx++) + { + int col = A->i[A->p[i] + idx]; + row_sum += A->x[A->p[i] + idx] * x[block_start + col]; + } + + y[y_offset + i] = row_sum; + } + } +} diff --git a/tests/all_tests.c b/tests/all_tests.c index 5246140..f923bd3 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -257,6 +257,10 @@ int main(void) mu_run_test(test_block_left_multiply_zero_column, tests_run); mu_run_test(test_csr_csc_matmul_alloc_basic, tests_run); mu_run_test(test_csr_csc_matmul_alloc_sparse, tests_run); + mu_run_test(test_block_left_multiply_vec_single_block, tests_run); + mu_run_test(test_block_left_multiply_vec_two_blocks, tests_run); + mu_run_test(test_block_left_multiply_vec_sparse, tests_run); + mu_run_test(test_block_left_multiply_vec_three_blocks, tests_run); mu_run_test(test_csr_vecmat_values_sparse, tests_run); mu_run_test(test_sum_all_rows_csr, tests_run); mu_run_test(test_sum_block_of_rows_csr, tests_run); diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h index 15f94ee..e35b481 100644 --- a/tests/utils/test_linalg.h +++ b/tests/utils/test_linalg.h @@ -259,3 +259,136 @@ const char *test_csr_csc_matmul_alloc_sparse() free_csc_matrix(B); return NULL; } + +/* Test block_left_multiply_vec with single block: y = A @ x */ +const char *test_block_left_multiply_vec_single_block() +{ + /* A is 2x3 CSR: + * [1.0 0.0 2.0] + * [0.0 3.0 0.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 3); + double Ax[3] = {1.0, 3.0, 2.0}; + int Ai[3] = {0, 1, 2}; + int Ap[3] = {0, 2, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (3*1)-length vector = [1.0, 2.0, 3.0] */ + double x[3] = {1.0, 2.0, 3.0}; + double y[2] = {0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 1); + + /* Expected: y = [1.0*1.0 + 0.0*2.0 + 2.0*3.0, 0.0*1.0 + 3.0*2.0 + 0.0*3.0] + * = [1.0 + 6.0, 6.0] + * = [7.0, 6.0] + */ + double expected_y[2] = {7.0, 6.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 2)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */ +const char *test_block_left_multiply_vec_two_blocks() +{ + /* A is 2x3 CSR: + * [1.0 2.0 0.0] + * [0.0 3.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 4); + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + int Ai[4] = {0, 1, 1, 2}; + int Ap[3] = {0, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (3*2)-length vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + * x1 = [1.0, 2.0, 3.0], x2 = [4.0, 5.0, 6.0] + */ + double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double y[4] = {0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 2); + + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*2.0 + 4.0*3.0] = [5.0, 18.0] + * Expected block 2: y[2:4] = A @ x2 = [1.0*4.0 + 2.0*5.0, 3.0*5.0 + 4.0*6.0] = [14.0, 39.0] + */ + double expected_y[4] = {5.0, 18.0, 14.0, 39.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with sparse matrix and multiple blocks */ +const char *test_block_left_multiply_vec_sparse() +{ + /* A is 3x4 CSR (very sparse): + * [2.0 0.0 0.0 0.0] + * [0.0 0.0 3.0 0.0] + * [0.0 0.0 0.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 4, 3); + double Ax[3] = {2.0, 3.0, 4.0}; + int Ai[3] = {0, 2, 3}; + int Ap[4] = {0, 1, 2, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* x is (4*2)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] + * x1 = [1.0, 2.0, 3.0, 4.0], x2 = [5.0, 6.0, 7.0, 8.0] + */ + double x[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 2); + + /* Expected block 1: y[0:3] = A @ x1 = [2.0*1.0, 3.0*3.0, 4.0*4.0] = [2.0, 9.0, 16.0] + * Expected block 2: y[3:6] = A @ x2 = [2.0*5.0, 3.0*7.0, 4.0*8.0] = [10.0, 21.0, 32.0] + */ + double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with three blocks */ +const char *test_block_left_multiply_vec_three_blocks() +{ + /* A is 2x2 CSR: + * [1.0 2.0] + * [3.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 2, 4); + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + int Ai[4] = {0, 1, 0, 1}; + int Ap[3] = {0, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (2*3)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + * x1 = [1.0, 2.0], x2 = [3.0, 4.0], x3 = [5.0, 6.0] + */ + double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 3); + + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*1.0 + 4.0*2.0] = [5.0, 11.0] + * Expected block 2: y[2:4] = A @ x2 = [1.0*3.0 + 2.0*4.0, 3.0*3.0 + 4.0*4.0] = [11.0, 25.0] + * Expected block 3: y[4:6] = A @ x3 = [1.0*5.0 + 2.0*6.0, 3.0*5.0 + 4.0*6.0] = [17.0, 39.0] + */ + double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); + + free_csr_matrix(A); + return NULL; +} From 3bd25ceb785fce2aa450438b37c04b1bb600296d Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:20:33 -0800 Subject: [PATCH 05/13] fix test --- src/bivariate/left_matmul.c | 2 ++ tests/utils/test_linalg.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index f8842ce..d1d5350 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -76,10 +76,12 @@ static void free_type_data(expr *node) free_csr_matrix(lin_node->A_kron_T); free_csc_matrix(lin_node->Jchild_CSC); free_csc_matrix(lin_node->J_CSC); + free(lin_node->iwork2); lin_node->A = NULL; lin_node->A_kron_T = NULL; lin_node->Jchild_CSC = NULL; lin_node->J_CSC = NULL; + lin_node->iwork2 = NULL; } static void jacobian_init(expr *node) diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h index e35b481..6a16d63 100644 --- a/tests/utils/test_linalg.h +++ b/tests/utils/test_linalg.h @@ -19,10 +19,10 @@ const char *test_block_left_multiply_single_block() CSR_Matrix *A = new_csr_matrix(2, 3, 3); double Ax[3] = {1.0, 1.0, 1.0}; int Ai[3] = {0, 1, 2}; - int Ap[4] = {0, 1, 3}; + int Ap[3] = {0, 1, 3}; memcpy(A->x, Ax, 3 * sizeof(double)); memcpy(A->i, Ai, 3 * sizeof(int)); - memcpy(A->p, Ap, 4 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); /* J is 3x2 CSC (single block, so p=1): * [1.0 0.0] From 40f85e0a7fffb2ae0634120c06c9adcd02007e39 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:22:44 -0800 Subject: [PATCH 06/13] ran formatter --- include/subexpr.h | 2 +- include/utils/linalg.h | 14 ++++++------- src/bivariate/right_matmul.c | 2 +- src/utils/CSC_Matrix.c | 2 -- src/utils/linalg.c | 8 +++---- tests/all_tests.c | 10 ++++----- tests/profiling/profile_left_matmul.h | 12 ++++++----- tests/utils/test_csr_csc_conversion.h | 1 - tests/utils/test_linalg.h | 30 +++++++++++++++------------ 9 files changed, 42 insertions(+), 39 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index e37c161..31d04f5 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -110,7 +110,7 @@ typedef struct left_matmul_expr CSR_Matrix *A; CSR_Matrix *A_kron_T; CSC_Matrix *Jchild_CSC; - CSC_Matrix *J_CSC; + CSC_Matrix *J_CSC; int *iwork2; /* workspace for csc to csr */ } left_matmul_expr; diff --git a/include/utils/linalg.h b/include/utils/linalg.h index 2b191c2..ec7419f 100644 --- a/include/utils/linalg.h +++ b/include/utils/linalg.h @@ -25,20 +25,20 @@ void block_left_multiply_fill_values(const struct CSR_Matrix *A, * C must have sparsity pattern already computed. */ void csr_csc_matmul_fill_values(const struct CSR_Matrix *A, - const struct CSC_Matrix *B, - struct CSR_Matrix *C); + const struct CSC_Matrix *B, struct CSR_Matrix *C); /* C = A @ B where A is CSR, B is CSC. Result C is CSR. * Allocates and precomputes sparsity pattern. No workspace required. */ struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A, - const struct CSC_Matrix *B); + const struct CSC_Matrix *B); /* Compute block-wise matrix-vector products. - * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is divided into p blocks of n elements. - * A is provided in CSR format. x is (n*p)-length vector, y is (m*p)-length output. + * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is divided into p blocks + * of n elements. A is provided in CSR format. x is (n*p)-length vector, y is + * (m*p)-length output. */ -void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, - double *y, int p); +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, + int p); #endif /* LINALG_H */ diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c index 8039318..81cb317 100644 --- a/src/bivariate/right_matmul.c +++ b/src/bivariate/right_matmul.c @@ -17,8 +17,8 @@ */ #include "bivariate.h" #include "subexpr.h" -#include #include "utils/linalg.h" +#include /* This file implements the atom 'right_matmul' corresponding to the operation y = f(x) @ A, where A is a given matrix and f(x) is an arbitrary expression. diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 1d9cd25..1e0d435 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -447,5 +447,3 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, } } } - - diff --git a/src/utils/linalg.c b/src/utils/linalg.c index 7e1fb4e..c3a9d21 100644 --- a/src/utils/linalg.c +++ b/src/utils/linalg.c @@ -196,9 +196,9 @@ void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, int m = A->m; int n = A->n; int k = J->n; - + int i, j, row_a, block, block_start, block_end, start, end; - + /* to get rid of unitialized warnings */ block = 0; block_start = 0; @@ -322,8 +322,8 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector. * x is split into p blocks of n elements each. */ -void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, - double *y, int p) +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, + int p) { assert(A != NULL && x != NULL && y != NULL && p > 0); diff --git a/tests/all_tests.c b/tests/all_tests.c index f923bd3..8e050de 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -44,8 +44,8 @@ #include "jacobian_tests/test_transpose.h" #include "problem/test_problem.h" #include "utils/test_csc_matrix.h" -#include "utils/test_csr_matrix.h" #include "utils/test_csr_csc_conversion.h" +#include "utils/test_csr_matrix.h" #include "utils/test_linalg.h" #include "wsum_hess/elementwise/test_entr.h" #include "wsum_hess/elementwise/test_exp.h" @@ -75,11 +75,11 @@ #include "wsum_hess/test_sum.h" #include "wsum_hess/test_trace.h" #include "wsum_hess/test_transpose.h" -#endif /* PROFILE_ONLY */ +#endif /* PROFILE_ONLY */ #ifdef PROFILE_ONLY #include "profiling/profile_left_matmul.h" -#endif /* PROFILE_ONLY */ +#endif /* PROFILE_ONLY */ int main(void) { @@ -279,12 +279,12 @@ int main(void) mu_run_test(test_problem_jacobian_multi, tests_run); mu_run_test(test_problem_constraint_forward, tests_run); mu_run_test(test_problem_hessian, tests_run); -#endif /* PROFILE_ONLY */ +#endif /* PROFILE_ONLY */ #ifdef PROFILE_ONLY printf("\n--- Profiling Tests ---\n"); mu_run_test(profile_left_matmul, tests_run); -#endif /* PROFILE_ONLY */ +#endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index e64ec5a..9865b8a 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -1,7 +1,7 @@ #include #include -#include #include +#include #include "affine.h" #include "bivariate.h" @@ -40,18 +40,20 @@ const char *profile_left_matmul() } // should benchmark forward later - //AX->forward(AX, x_vals); + // AX->forward(AX, x_vals); Timer timer; clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->jacobian_init(AX); clock_gettime(CLOCK_MONOTONIC, &timer.end); - printf("left_matmul jacobian init time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); + printf("left_matmul jacobian init time: %8.3f seconds\n", + GET_ELAPSED_SECONDS(timer)); clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->eval_jacobian(AX); clock_gettime(CLOCK_MONOTONIC, &timer.end); - printf("left_matmul jacobian eval time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); - + printf("left_matmul jacobian eval time: %8.3f seconds\n", + GET_ELAPSED_SECONDS(timer)); + free(x_vals); free_csr_matrix(A); free_expr(AX); diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h index c9e24f7..c7daeb6 100644 --- a/tests/utils/test_csr_csc_conversion.h +++ b/tests/utils/test_csr_csc_conversion.h @@ -169,4 +169,3 @@ const char *test_csr_csc_csr_roundtrip() return 0; } - diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h index 6a16d63..e7a6896 100644 --- a/tests/utils/test_linalg.h +++ b/tests/utils/test_linalg.h @@ -41,10 +41,11 @@ const char *test_block_left_multiply_single_block() CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); /* Expected C is 2x2: - * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has row 0) - * C[1,0] = A[1,:] @ J[:,0] = 1.0*1.0 + 1.0*0.0 = 1.0 (row 1 has columns 1,2, J col 0 has row 1) - * C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J col 1 has row 2) - * C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns 1,2, J col 1 has row 2) + * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has + * row 0) C[1,0] = A[1,:] @ J[:,0] = 1.0*1.0 + 1.0*0.0 = 1.0 (row 1 has columns + * 1,2, J col 0 has row 1) C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J + * col 1 has row 2) C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns + * 1,2, J col 1 has row 2) */ int expected_p[3] = {0, 2, 3}; int expected_i[3] = {0, 1, 1}; @@ -98,7 +99,7 @@ const char *test_block_left_multiply_two_blocks() /* Compute C = [A @ J1; A @ J2] where: * J1 = [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]] * J2 = [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] - * + * * C = [A @ J1; A @ J2] is 4x3: * A @ J1 = [[1, 0, 0], [0, 0, 0]] (row 0 of A matches col 0 of J1) * A @ J2 = [[0, 0, 0], [0, 1, 1]] (row 1 of A matches cols 1,2 of J2) @@ -145,7 +146,7 @@ const char *test_block_left_multiply_zero_column() CSC_Matrix *J = new_csc_matrix(2, 2, 1); double Jx[1] = {1.0}; int Ji[1] = {0}; - int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ + int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ memcpy(J->x, Jx, 1 * sizeof(double)); memcpy(J->i, Ji, 1 * sizeof(int)); memcpy(J->p, Jp, 3 * sizeof(int)); @@ -315,8 +316,9 @@ const char *test_block_left_multiply_vec_two_blocks() block_left_multiply_vec(A, x, y, 2); - /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*2.0 + 4.0*3.0] = [5.0, 18.0] - * Expected block 2: y[2:4] = A @ x2 = [1.0*4.0 + 2.0*5.0, 3.0*5.0 + 4.0*6.0] = [14.0, 39.0] + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*2.0 + 4.0*3.0] = + * [5.0, 18.0] Expected block 2: y[2:4] = A @ x2 = [1.0*4.0 + 2.0*5.0, 3.0*5.0 + * + 4.0*6.0] = [14.0, 39.0] */ double expected_y[4] = {5.0, 18.0, 14.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4)); @@ -349,8 +351,9 @@ const char *test_block_left_multiply_vec_sparse() block_left_multiply_vec(A, x, y, 2); - /* Expected block 1: y[0:3] = A @ x1 = [2.0*1.0, 3.0*3.0, 4.0*4.0] = [2.0, 9.0, 16.0] - * Expected block 2: y[3:6] = A @ x2 = [2.0*5.0, 3.0*7.0, 4.0*8.0] = [10.0, 21.0, 32.0] + /* Expected block 1: y[0:3] = A @ x1 = [2.0*1.0, 3.0*3.0, 4.0*4.0] = + * [2.0, 9.0, 16.0] Expected block 2: y[3:6] = A @ x2 = + * [2.0*5.0, 3.0*7.0, 4.0*8.0] = [10.0, 21.0, 32.0] */ double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); @@ -382,9 +385,10 @@ const char *test_block_left_multiply_vec_three_blocks() block_left_multiply_vec(A, x, y, 3); - /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*1.0 + 4.0*2.0] = [5.0, 11.0] - * Expected block 2: y[2:4] = A @ x2 = [1.0*3.0 + 2.0*4.0, 3.0*3.0 + 4.0*4.0] = [11.0, 25.0] - * Expected block 3: y[4:6] = A @ x3 = [1.0*5.0 + 2.0*6.0, 3.0*5.0 + 4.0*6.0] = [17.0, 39.0] + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*1.0 + 4.0*2.0] = + * [5.0, 11.0] Expected block 2: y[2:4] = A @ x2 = [1.0*3.0 + 2.0*4.0, 3.0*3.0 + * + 4.0*4.0] = [11.0, 25.0] Expected block 3: y[4:6] = A @ x3 = [1.0*5.0 + * + 2.0*6.0, 3.0*5.0 + 4.0*6.0] = [17.0, 39.0] */ double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); From 7a1bd4e711a5dd74014e1e3aee42e51249bef3a5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:25:10 -0800 Subject: [PATCH 07/13] profile forward pass --- src/utils/linalg.c | 2 -- tests/profiling/profile_left_matmul.h | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/utils/linalg.c b/src/utils/linalg.c index c3a9d21..82c03e0 100644 --- a/src/utils/linalg.c +++ b/src/utils/linalg.c @@ -325,8 +325,6 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, int p) { - assert(A != NULL && x != NULL && y != NULL && p > 0); - /* For each block */ for (int block = 0; block < p; block++) { diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index 9865b8a..e477c43 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -14,7 +14,7 @@ const char *profile_left_matmul() { /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ - int n = 50; + int n = 100; expr *X = new_variable(n, n, 0, n * n); CSR_Matrix *A = new_csr_matrix(n, n, n * n); for (int i = 0; i < n * n; i++) @@ -39,10 +39,11 @@ const char *profile_left_matmul() x_vals[i] = 1.0; } - // should benchmark forward later - // AX->forward(AX, x_vals); - Timer timer; + clock_gettime(CLOCK_MONOTONIC, &timer.start); + AX->forward(AX, x_vals); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul forward time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->jacobian_init(AX); clock_gettime(CLOCK_MONOTONIC, &timer.end); From f3bd95ffb1d1ac47fed9533bcbb083484860d790 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 12 Feb 2026 15:26:42 -0800 Subject: [PATCH 08/13] ran formatter --- tests/profiling/profile_left_matmul.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index e477c43..a8e819b 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -40,7 +40,7 @@ const char *profile_left_matmul() } Timer timer; - clock_gettime(CLOCK_MONOTONIC, &timer.start); + clock_gettime(CLOCK_MONOTONIC, &timer.start); AX->forward(AX, x_vals); clock_gettime(CLOCK_MONOTONIC, &timer.end); printf("left_matmul forward time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); From d6687087c448b79a85a13750cbfa41a15cc897ec Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 13 Feb 2026 05:18:25 -0800 Subject: [PATCH 09/13] removed kroenecker product from hessian --- include/subexpr.h | 1 + src/bivariate/left_matmul.c | 28 ++++++++++++---------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index 31d04f5..b73688f 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -108,6 +108,7 @@ typedef struct left_matmul_expr { expr base; CSR_Matrix *A; + CSR_Matrix *AT; CSR_Matrix *A_kron_T; CSC_Matrix *Jchild_CSC; CSC_Matrix *J_CSC; diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index d1d5350..2782627 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -73,12 +73,12 @@ static void free_type_data(expr *node) { left_matmul_expr *lin_node = (left_matmul_expr *) node; free_csr_matrix(lin_node->A); - free_csr_matrix(lin_node->A_kron_T); + free_csr_matrix(lin_node->AT); free_csc_matrix(lin_node->Jchild_CSC); free_csc_matrix(lin_node->J_CSC); free(lin_node->iwork2); lin_node->A = NULL; - lin_node->A_kron_T = NULL; + lin_node->AT = NULL; lin_node->Jchild_CSC = NULL; lin_node->J_CSC = NULL; lin_node->iwork2 = NULL; @@ -128,15 +128,15 @@ static void wsum_hess_init(expr *node) memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); /* work for computing A^T w*/ - int dim = ((left_matmul_expr *) node)->A_kron_T->m; + int dim = ((left_matmul_expr *) node)->AT->m * node->d2; node->dwork = (double *) malloc(dim * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { /* compute A^T w*/ - left_matmul_expr *lin_node = (left_matmul_expr *) node; - csr_matvec_wo_offset(lin_node->A_kron_T, w, node->dwork); + CSR_Matrix *AT = ((left_matmul_expr *) node)->AT; + block_left_multiply_vec(AT, w, node->dwork, node->d2); node->left->eval_wsum_hess(node->left, node->dwork); memcpy(node->wsum_hess->x, node->left->wsum_hess->x, @@ -149,18 +149,18 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) to do A @ u where u is (n, ) which in C is actually (1, n). In that case the result of A @ u is (m, ), which is (1, m) according to broadcasting rules. We therefore check if this is the case. */ - int d1, d2, n_blocks; + int d1, d2; //n_blocks; if (u->d1 == A->n) { d1 = A->m; d2 = u->d2; - n_blocks = u->d2; + //n_blocks = u->d2; } else if (u->d2 == A->n && u->d1 == 1) { d1 = 1; d2 = A->m; - n_blocks = 1; + //n_blocks = 1; } else { @@ -177,16 +177,12 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) node->left = u; expr_retain(u); - /* store A */ + /* store A and AT */ lin_node->A = new_csr(A); lin_node->iwork2 = (int *) malloc((A->m * d2) * sizeof(int)); - - /* create kron(I, A)^T */ - int alloc = MAX(A->n * n_blocks, node->n_vars); + int alloc = MAX(A->n, node->n_vars); node->iwork = (int *) malloc(alloc * sizeof(int)); - CSR_Matrix *AT_temp = transpose(lin_node->A, node->iwork); - lin_node->A_kron_T = block_diag_repeat_csr(AT_temp, n_blocks); - free_csr_matrix(AT_temp); - + lin_node->AT = transpose(lin_node->A, node->iwork); + return node; } From 8259371ab49b0901e7b7fc56976cd2716b8e976e Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 13 Feb 2026 08:00:35 -0800 Subject: [PATCH 10/13] minor changes --- include/subexpr.h | 2 +- src/bivariate/left_matmul.c | 31 +++++++++++++++++++++---------- src/problem.c | 2 +- src/utils/CSC_Matrix.c | 4 ++++ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index b73688f..520f466 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -109,7 +109,7 @@ typedef struct left_matmul_expr expr base; CSR_Matrix *A; CSR_Matrix *AT; - CSR_Matrix *A_kron_T; + int n_blocks; CSC_Matrix *Jchild_CSC; CSC_Matrix *J_CSC; int *iwork2; /* workspace for csc to csr */ diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 2782627..34c2c50 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -61,7 +61,8 @@ static void forward(expr *node, const double *u) /* y = A_kron @ vec(f(x)) */ CSR_Matrix *A = ((left_matmul_expr *) node)->A; - block_left_multiply_vec(A, x->value, node->value, x->d2); + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + block_left_multiply_vec(A, x->value, node->value, n_blocks); } static bool is_affine(const expr *node) @@ -86,6 +87,7 @@ static void free_type_data(expr *node) static void jacobian_init(expr *node) { + printf("Initializing jacobian for left_matmul... \n"); expr *x = node->left; left_matmul_expr *lin_node = (left_matmul_expr *) node; @@ -95,8 +97,9 @@ static void jacobian_init(expr *node) /* precompute sparsity of this node's jacobian in CSC and CSR */ lin_node->J_CSC = - block_left_multiply_fill_sparsity(lin_node->A, lin_node->Jchild_CSC, x->d2); + block_left_multiply_fill_sparsity(lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks); node->jacobian = csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->iwork2); + printf("Done initializing jacobian for left_matmul. \n"); } static void eval_jacobian(expr *node) @@ -118,6 +121,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init(expr *node) { + printf("Initializing wsum_hess for left_matmul... \n"); /* initialize child's hessian */ expr *x = node->left; x->wsum_hess_init(x); @@ -128,15 +132,18 @@ static void wsum_hess_init(expr *node) memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); /* work for computing A^T w*/ - int dim = ((left_matmul_expr *) node)->AT->m * node->d2; + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + int dim = ((left_matmul_expr *) node)->A->n * n_blocks; node->dwork = (double *) malloc(dim * sizeof(double)); + printf("Done initializing wsum_hess for left_matmul. \n"); } static void eval_wsum_hess(expr *node, const double *w) { /* compute A^T w*/ CSR_Matrix *AT = ((left_matmul_expr *) node)->AT; - block_left_multiply_vec(AT, w, node->dwork, node->d2); + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + block_left_multiply_vec(AT, w, node->dwork, n_blocks); node->left->eval_wsum_hess(node->left, node->dwork); memcpy(node->wsum_hess->x, node->left->wsum_hess->x, @@ -149,18 +156,18 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) to do A @ u where u is (n, ) which in C is actually (1, n). In that case the result of A @ u is (m, ), which is (1, m) according to broadcasting rules. We therefore check if this is the case. */ - int d1, d2; //n_blocks; + int d1, d2, n_blocks; if (u->d1 == A->n) { d1 = A->m; d2 = u->d2; - //n_blocks = u->d2; + n_blocks = u->d2; } else if (u->d2 == A->n && u->d1 == 1) { d1 = 1; d2 = A->m; - //n_blocks = 1; + n_blocks = 1; } else { @@ -177,11 +184,15 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) node->left = u; expr_retain(u); + /* allocate workspace. iwork is used for transposing A (requiring size A->n) + and for converting J_child csr to csc (requring size node->n_vars). + iwork2 is used for converting J_CSC to CSR (requring node->size) */ + node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int)); + lin_node->iwork2 = (int *) malloc(node->size * sizeof(int)); + lin_node->n_blocks = n_blocks; + /* store A and AT */ lin_node->A = new_csr(A); - lin_node->iwork2 = (int *) malloc((A->m * d2) * sizeof(int)); - int alloc = MAX(A->n, node->n_vars); - node->iwork = (int *) malloc(alloc * sizeof(int)); lin_node->AT = transpose(lin_node->A, node->iwork); return node; diff --git a/src/problem.c b/src/problem.c index d9c14a9..aae28bc 100644 --- a/src/problem.c +++ b/src/problem.c @@ -246,7 +246,7 @@ static inline void print_end_message(const Diff_engine_stats *stats) { printf("\n" "============================================================\n" - " DNLP Differentiation Engine v%s\n" + " SparseDifferentiation v%s\n" " (c) D. Cederberg and W. Zhang, Stanford University, 2026\n" "============================================================\n", DIFF_ENGINE_VERSION); diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 1e0d435..b6b72f7 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -19,6 +19,7 @@ #include "utils/iVec.h" #include #include +#include CSC_Matrix *new_csc_matrix(int m, int n, int nnz) { @@ -292,6 +293,7 @@ CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork) { for (j = A->p[i]; j < A->p[i + 1]; ++j) { + assert(A->i[j] < A->m); count[A->i[j]]++; } } @@ -313,6 +315,7 @@ CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork) { for (j = A->p[i]; j < A->p[i + 1]; ++j) { + assert(A->i[j] < A->m); C->i[count[A->i[j]]] = i; count[A->i[j]]++; } @@ -334,6 +337,7 @@ void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) { for (j = A->p[i]; j < A->p[i + 1]; ++j) { + assert(A->i[j] < A->m); C->x[count[A->i[j]]] = A->x[j]; count[A->i[j]]++; } From 6b2ae18bb6ba7a2d427131039ed98df903320f55 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 13 Feb 2026 09:09:38 -0800 Subject: [PATCH 11/13] broadcast fix --- src/affine/broadcast.c | 23 ++++++++++--------- src/bivariate/left_matmul.c | 4 ---- tests/all_tests.c | 1 + tests/jacobian_tests/test_broadcast.h | 32 +++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/affine/broadcast.c b/src/affine/broadcast.c index 870e353..c16fe10 100644 --- a/src/affine/broadcast.c +++ b/src/affine/broadcast.c @@ -89,7 +89,7 @@ static void jacobian_init(expr *node) else { /* Scalar broadcast: (1, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d1 * node->d2; + total_nnz = x->jacobian->nnz * node->size; } node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz); @@ -99,10 +99,10 @@ static void jacobian_init(expr *node) // --------------------------------------------------------------------- CSR_Matrix *Jx = x->jacobian; CSR_Matrix *J = node->jacobian; - J->nnz = 0; if (bcast->type == BROADCAST_ROW) { + J->nnz = 0; for (int i = 0; i < node->d2; i++) { int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; @@ -117,11 +117,11 @@ static void jacobian_init(expr *node) J->nnz += nnz_in_row; } } + assert(J->nnz == total_nnz); J->p[node->size] = total_nnz; } else if (bcast->type == BROADCAST_COL) { - /* copy column indices */ tile_int(J->i, Jx->i, Jx->nnz, node->d2); @@ -129,10 +129,11 @@ static void jacobian_init(expr *node) int offset = 0; for (int i = 0; i < node->d2; i++) { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; for (int j = 0; j < node->d1; j++) { J->p[i * node->d1 + j] = offset; - offset += Jx->p[1] - Jx->p[0]; + offset += nnz_in_row; } } assert(offset == total_nnz); @@ -141,12 +142,12 @@ static void jacobian_init(expr *node) else { /* copy column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->d1 * node->d2); + tile_int(J->i, Jx->i, Jx->nnz, node->size); /* set row pointers */ int offset = 0; int nnz = Jx->p[1] - Jx->p[0]; - for (int i = 0; i < node->d1 * node->d2; i++) + for (int i = 0; i < node->size; i++) { J->p[i] = offset; offset += nnz; @@ -163,10 +164,10 @@ static void eval_jacobian(expr *node) broadcast_expr *bcast = (broadcast_expr *) node; CSR_Matrix *Jx = node->left->jacobian; CSR_Matrix *J = node->jacobian; - J->nnz = 0; if (bcast->type == BROADCAST_ROW) { + J->nnz = 0; for (int i = 0; i < node->d2; i++) { int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; @@ -180,7 +181,7 @@ static void eval_jacobian(expr *node) } else { - tile_double(J->x, Jx->x, Jx->nnz, node->d1 * node->d2); + tile_double(J->x, Jx->x, Jx->nnz, node->size); } } @@ -268,9 +269,9 @@ expr *new_broadcast(expr *child, int d1, int d2) } else { - fprintf( - stderr, - "ERROR: inconsistency of broadcasting between DNLP-diff and CVXPY. \n"); + fprintf(stderr, + "ERROR: inconsistency of broadcasting between SparseDifferentiation" + " and CVXPY. \n"); exit(1); } diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 34c2c50..7423da8 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -87,7 +87,6 @@ static void free_type_data(expr *node) static void jacobian_init(expr *node) { - printf("Initializing jacobian for left_matmul... \n"); expr *x = node->left; left_matmul_expr *lin_node = (left_matmul_expr *) node; @@ -99,7 +98,6 @@ static void jacobian_init(expr *node) lin_node->J_CSC = block_left_multiply_fill_sparsity(lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks); node->jacobian = csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->iwork2); - printf("Done initializing jacobian for left_matmul. \n"); } static void eval_jacobian(expr *node) @@ -121,7 +119,6 @@ static void eval_jacobian(expr *node) static void wsum_hess_init(expr *node) { - printf("Initializing wsum_hess for left_matmul... \n"); /* initialize child's hessian */ expr *x = node->left; x->wsum_hess_init(x); @@ -135,7 +132,6 @@ static void wsum_hess_init(expr *node) int n_blocks = ((left_matmul_expr *) node)->n_blocks; int dim = ((left_matmul_expr *) node)->A->n * n_blocks; node->dwork = (double *) malloc(dim * sizeof(double)); - printf("Done initializing wsum_hess for left_matmul. \n"); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/tests/all_tests.c b/tests/all_tests.c index 8e050de..3b23e45 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -162,6 +162,7 @@ int main(void) mu_run_test(test_broadcast_row_jacobian, tests_run); mu_run_test(test_broadcast_col_jacobian, tests_run); mu_run_test(test_broadcast_scalar_to_matrix_jacobian, tests_run); + mu_run_test(test_double_broadcast, tests_run); mu_run_test(test_wsum_hess_multiply_1, tests_run); mu_run_test(test_wsum_hess_multiply_2, tests_run); mu_run_test(test_jacobian_trace_variable, tests_run); diff --git a/tests/jacobian_tests/test_broadcast.h b/tests/jacobian_tests/test_broadcast.h index 43872fe..4940b38 100644 --- a/tests/jacobian_tests/test_broadcast.h +++ b/tests/jacobian_tests/test_broadcast.h @@ -133,3 +133,35 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) free_expr(bcast); return 0; } + +const char *test_double_broadcast(void) +{ + double x_vals[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + double b_vals[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; + + /* form the expression x + b */ + expr *x = new_variable(5, 1, 0, 5); + expr *b = new_constant(1, 5, 5, b_vals); + expr *bcast_x = new_broadcast(x, 5, 5); + expr *bcast_b = new_broadcast(b, 5, 5); + expr *sum = new_add(bcast_x, bcast_b); + + sum->forward(sum, x_vals); + sum->jacobian_init(sum); + sum->eval_jacobian(sum); + + /* All 6 elements depend on the single input variable */ + //double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + //int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; + //int expected_i[6] = {0, 0, 0, 0, 0, 0}; +// + //mu_assert("broadcast scalar jacobian vals fail", + // cmp_double_array(sum->jacobian->x, expected_x, 6)); + //mu_assert("broadcast scalar jacobian rows fail", + // cmp_int_array(sum ->jacobian->p, expected_p, 7)); + //mu_assert("broadcast scalar jacobian cols fail", + // cmp_int_array(bcast->jacobian->i, expected_i, 6)); + + free_expr(sum); + return 0; +} From fa947bcf493b6738eb89ad5576ac73ae10adb239 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 13 Feb 2026 09:30:01 -0800 Subject: [PATCH 12/13] ran formatter --- include/subexpr.h | 4 ++-- src/bivariate/left_matmul.c | 20 +++++++++++--------- src/utils/CSC_Matrix.c | 2 +- tests/jacobian_tests/test_broadcast.h | 14 +++++++------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index 520f466..12bb1d2 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -109,10 +109,10 @@ typedef struct left_matmul_expr expr base; CSR_Matrix *A; CSR_Matrix *AT; - int n_blocks; + int n_blocks; CSC_Matrix *Jchild_CSC; CSC_Matrix *J_CSC; - int *iwork2; /* workspace for csc to csr */ + int *csc_to_csr_workspace; } left_matmul_expr; /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 7423da8..7b1baee 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -77,12 +77,12 @@ static void free_type_data(expr *node) free_csr_matrix(lin_node->AT); free_csc_matrix(lin_node->Jchild_CSC); free_csc_matrix(lin_node->J_CSC); - free(lin_node->iwork2); + free(lin_node->csc_to_csr_workspace); lin_node->A = NULL; lin_node->AT = NULL; lin_node->Jchild_CSC = NULL; lin_node->J_CSC = NULL; - lin_node->iwork2 = NULL; + lin_node->csc_to_csr_workspace = NULL; } static void jacobian_init(expr *node) @@ -95,9 +95,10 @@ static void jacobian_init(expr *node) lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); /* precompute sparsity of this node's jacobian in CSC and CSR */ - lin_node->J_CSC = - block_left_multiply_fill_sparsity(lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks); - node->jacobian = csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->iwork2); + lin_node->J_CSC = block_left_multiply_fill_sparsity( + lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks); + node->jacobian = + csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace); } static void eval_jacobian(expr *node) @@ -114,7 +115,7 @@ static void eval_jacobian(expr *node) /* compute this node's jacobian: */ block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC); - csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->iwork2); + csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace); } static void wsum_hess_init(expr *node) @@ -182,14 +183,15 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) /* allocate workspace. iwork is used for transposing A (requiring size A->n) and for converting J_child csr to csc (requring size node->n_vars). - iwork2 is used for converting J_CSC to CSR (requring node->size) */ + csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size) + */ node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int)); - lin_node->iwork2 = (int *) malloc(node->size * sizeof(int)); + lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int)); lin_node->n_blocks = n_blocks; /* store A and AT */ lin_node->A = new_csr(A); lin_node->AT = transpose(lin_node->A, node->iwork); - + return node; } diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index b6b72f7..d017a62 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -17,9 +17,9 @@ */ #include "utils/CSC_Matrix.h" #include "utils/iVec.h" +#include #include #include -#include CSC_Matrix *new_csc_matrix(int m, int n, int nnz) { diff --git a/tests/jacobian_tests/test_broadcast.h b/tests/jacobian_tests/test_broadcast.h index 4940b38..612e5cf 100644 --- a/tests/jacobian_tests/test_broadcast.h +++ b/tests/jacobian_tests/test_broadcast.h @@ -151,15 +151,15 @@ const char *test_double_broadcast(void) sum->eval_jacobian(sum); /* All 6 elements depend on the single input variable */ - //double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - //int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; - //int expected_i[6] = {0, 0, 0, 0, 0, 0}; -// - //mu_assert("broadcast scalar jacobian vals fail", + // double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + // int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; + // int expected_i[6] = {0, 0, 0, 0, 0, 0}; + // + // mu_assert("broadcast scalar jacobian vals fail", // cmp_double_array(sum->jacobian->x, expected_x, 6)); - //mu_assert("broadcast scalar jacobian rows fail", + // mu_assert("broadcast scalar jacobian rows fail", // cmp_int_array(sum ->jacobian->p, expected_p, 7)); - //mu_assert("broadcast scalar jacobian cols fail", + // mu_assert("broadcast scalar jacobian cols fail", // cmp_int_array(bcast->jacobian->i, expected_i, 6)); free_expr(sum); From 0542a4d70e9830a4126dd7bc06b56d658c33d4f5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 13 Feb 2026 09:40:12 -0800 Subject: [PATCH 13/13] improved documentation of block_left_mult --- include/subexpr.h | 2 +- include/utils/linalg.h | 34 ++++++++++++++++++---------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index 12bb1d2..b3f4170 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -103,7 +103,7 @@ typedef struct elementwise_mult_expr /* Left matrix multiplication: y = A * f(x) where f(x) is an expression. Note that here A does not have global column indices but it is a local matrix. This is an -important distinction compared to ear_op_expr. */ +important distinction compared to linear_op_expr. */ typedef struct left_matmul_expr { expr base; diff --git a/include/utils/linalg.h b/include/utils/linalg.h index ec7419f..25890a9 100644 --- a/include/utils/linalg.h +++ b/include/utils/linalg.h @@ -5,22 +5,32 @@ struct CSR_Matrix; struct CSC_Matrix; -/* Compute sparsity pattern for block left multiplication. - * C = [A @ J1; A @ J2; ...; A @ Jp] where A is m x n and J is (n*p) x k. - * J is provided in CSC format and is split into p blocks of n rows each. - * Result C is returned as CSC format with dimensions (m*p) x k. - */ +/* Compute sparsity pattern and values for the matrix-matrix multiplication + C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k, + without relying on generic sparse matrix-matrix multiplication. Specialized + logic for this is much faster (50-100x) than generic sparse matmul. + + * J is provided in CSC format and is split into p blocks of n rows each + * C is returned in CSC format + * Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp], + where J = [J1; J2; ...; Jp] +*/ struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A, const struct CSC_Matrix *J, int p); -/* Fill values for block left multiplication. - * C must have sparsity pattern already computed for [A @ J1; ...; A @ Jp]. - */ void block_left_multiply_fill_values(const struct CSR_Matrix *A, const struct CSC_Matrix *J, struct CSC_Matrix *C); +/* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector. + The output y is m*p-length vector corresponding to + y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n + elements. +*/ +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, + int p); + /* Fill values of C = A @ B where A is CSR, B is CSC. * C must have sparsity pattern already computed. */ @@ -33,12 +43,4 @@ void csr_csc_matmul_fill_values(const struct CSR_Matrix *A, struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A, const struct CSC_Matrix *B); -/* Compute block-wise matrix-vector products. - * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is divided into p blocks - * of n elements. A is provided in CSR format. x is (n*p)-length vector, y is - * (m*p)-length output. - */ -void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, - int p); - #endif /* LINALG_H */