diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ce9857..e9c4d9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON) # ============================================================================= # C tests (only for standalone builds) # ============================================================================= +option(PROFILE_ONLY "Build only profiling tests" OFF) + if(NOT SKBUILD) include_directories(${PROJECT_SOURCE_DIR}/tests) enable_testing() @@ -90,5 +92,11 @@ if(NOT SKBUILD) tests/test_helpers.c ) target_link_libraries(all_tests dnlp_diff) + + if(PROFILE_ONLY) + target_compile_definitions(all_tests PRIVATE PROFILE_ONLY) + message(STATUS "Building ONLY profiling tests") + endif() + add_test(NAME AllTests COMMAND all_tests) endif() diff --git a/include/subexpr.h b/include/subexpr.h index a663282..b3f4170 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -109,7 +109,10 @@ typedef struct left_matmul_expr expr base; CSR_Matrix *A; CSR_Matrix *AT; - CSC_Matrix *CSC_work; + int n_blocks; + CSC_Matrix *Jchild_CSC; + CSC_Matrix *J_CSC; + int *csc_to_csr_workspace; } left_matmul_expr; /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_Matrix.h index ed044dc..c444dcc 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_Matrix.h @@ -52,15 +52,7 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C) CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork); void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); -/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC - * Precomputes sparsity pattern. No workspace required. - */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B); - -/* Fill values of C = A @ B where A is CSR, B is CSC - * C must have sparsity pattern already computed - */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C); +CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork); +void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork); #endif /* CSC_MATRIX_H */ diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_Matrix.h index 2e98abf..8c3f523 100644 --- a/include/utils/CSR_Matrix.h +++ b/include/utils/CSR_Matrix.h @@ -27,6 +27,7 @@ typedef struct CSR_Matrix /* Allocate a new CSR matrix with given dimensions and nnz */ CSR_Matrix *new_csr_matrix(int m, int n, int nnz); +CSR_Matrix *new_csr(const CSR_Matrix *A); /* Free a CSR matrix */ void free_csr_matrix(CSR_Matrix *matrix); diff --git a/include/utils/linalg.h b/include/utils/linalg.h new file mode 100644 index 0000000..25890a9 --- /dev/null +++ b/include/utils/linalg.h @@ -0,0 +1,46 @@ +#ifndef LINALG_H +#define LINALG_H + +/* Forward declarations */ +struct CSR_Matrix; +struct CSC_Matrix; + +/* Compute sparsity pattern and values for the matrix-matrix multiplication + C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k, + without relying on generic sparse matrix-matrix multiplication. Specialized + logic for this is much faster (50-100x) than generic sparse matmul. + + * J is provided in CSC format and is split into p blocks of n rows each + * C is returned in CSC format + * Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp], + where J = [J1; J2; ...; Jp] +*/ +struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A, + const struct CSC_Matrix *J, + int p); + +void block_left_multiply_fill_values(const struct CSR_Matrix *A, + const struct CSC_Matrix *J, + struct CSC_Matrix *C); + +/* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector. + The output y is m*p-length vector corresponding to + y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n + elements. +*/ +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, + int p); + +/* Fill values of C = A @ B where A is CSR, B is CSC. + * C must have sparsity pattern already computed. + */ +void csr_csc_matmul_fill_values(const struct CSR_Matrix *A, + const struct CSC_Matrix *B, struct CSR_Matrix *C); + +/* C = A @ B where A is CSR, B is CSC. Result C is CSR. + * Allocates and precomputes sparsity pattern. No workspace required. + */ +struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A, + const struct CSC_Matrix *B); + +#endif /* LINALG_H */ diff --git a/src/affine/broadcast.c b/src/affine/broadcast.c index 870e353..c16fe10 100644 --- a/src/affine/broadcast.c +++ b/src/affine/broadcast.c @@ -89,7 +89,7 @@ static void jacobian_init(expr *node) else { /* Scalar broadcast: (1, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d1 * node->d2; + total_nnz = x->jacobian->nnz * node->size; } node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz); @@ -99,10 +99,10 @@ static void jacobian_init(expr *node) // --------------------------------------------------------------------- CSR_Matrix *Jx = x->jacobian; CSR_Matrix *J = node->jacobian; - J->nnz = 0; if (bcast->type == BROADCAST_ROW) { + J->nnz = 0; for (int i = 0; i < node->d2; i++) { int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; @@ -117,11 +117,11 @@ static void jacobian_init(expr *node) J->nnz += nnz_in_row; } } + assert(J->nnz == total_nnz); J->p[node->size] = total_nnz; } else if (bcast->type == BROADCAST_COL) { - /* copy column indices */ tile_int(J->i, Jx->i, Jx->nnz, node->d2); @@ -129,10 +129,11 @@ static void jacobian_init(expr *node) int offset = 0; for (int i = 0; i < node->d2; i++) { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; for (int j = 0; j < node->d1; j++) { J->p[i * node->d1 + j] = offset; - offset += Jx->p[1] - Jx->p[0]; + offset += nnz_in_row; } } assert(offset == total_nnz); @@ -141,12 +142,12 @@ static void jacobian_init(expr *node) else { /* copy column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->d1 * node->d2); + tile_int(J->i, Jx->i, Jx->nnz, node->size); /* set row pointers */ int offset = 0; int nnz = Jx->p[1] - Jx->p[0]; - for (int i = 0; i < node->d1 * node->d2; i++) + for (int i = 0; i < node->size; i++) { J->p[i] = offset; offset += nnz; @@ -163,10 +164,10 @@ static void eval_jacobian(expr *node) broadcast_expr *bcast = (broadcast_expr *) node; CSR_Matrix *Jx = node->left->jacobian; CSR_Matrix *J = node->jacobian; - J->nnz = 0; if (bcast->type == BROADCAST_ROW) { + J->nnz = 0; for (int i = 0; i < node->d2; i++) { int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; @@ -180,7 +181,7 @@ static void eval_jacobian(expr *node) } else { - tile_double(J->x, Jx->x, Jx->nnz, node->d1 * node->d2); + tile_double(J->x, Jx->x, Jx->nnz, node->size); } } @@ -268,9 +269,9 @@ expr *new_broadcast(expr *child, int d1, int d2) } else { - fprintf( - stderr, - "ERROR: inconsistency of broadcasting between DNLP-diff and CVXPY. \n"); + fprintf(stderr, + "ERROR: inconsistency of broadcasting between SparseDifferentiation" + " and CVXPY. \n"); exit(1); } diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c index 92dd762..7b1baee 100644 --- a/src/bivariate/left_matmul.c +++ b/src/bivariate/left_matmul.c @@ -17,6 +17,8 @@ */ #include "bivariate.h" #include "subexpr.h" +#include "utils/Timer.h" +#include "utils/linalg.h" #include #include #include @@ -31,7 +33,9 @@ * To compute the forward pass: vec(y) = A_kron @ vec(f(x)), where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p), or more specificely, a block-diagonal matrix with p blocks of A along the - diagonal. + diagonal. In the refactored implementation we don't form A_kron explicitly, + only conceptually. This led to a 100x speedup in the initialization of the + Jacobian sparsity pattern. * To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the Jacobian of f(x) of size (n*p) x n_vars. @@ -42,7 +46,8 @@ Working in terms of A_kron unifies the implementation of f(x) being vector-valued or matrix-valued. - + I (dance858) think we can get additional big speedups when A is dense by + introducing a dense matrix class. */ #include "utils/utils.h" @@ -55,7 +60,9 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A_kron @ vec(f(x)) */ - csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value); + CSR_Matrix *A = ((left_matmul_expr *) node)->A; + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + block_left_multiply_vec(A, x->value, node->value, n_blocks); } static bool is_affine(const expr *node) @@ -68,13 +75,14 @@ static void free_type_data(expr *node) left_matmul_expr *lin_node = (left_matmul_expr *) node; free_csr_matrix(lin_node->A); free_csr_matrix(lin_node->AT); - if (lin_node->CSC_work) - { - free_csc_matrix(lin_node->CSC_work); - } + free_csc_matrix(lin_node->Jchild_CSC); + free_csc_matrix(lin_node->J_CSC); + free(lin_node->csc_to_csr_workspace); lin_node->A = NULL; lin_node->AT = NULL; - lin_node->CSC_work = NULL; + lin_node->Jchild_CSC = NULL; + lin_node->J_CSC = NULL; + lin_node->csc_to_csr_workspace = NULL; } static void jacobian_init(expr *node) @@ -82,25 +90,32 @@ static void jacobian_init(expr *node) expr *x = node->left; left_matmul_expr *lin_node = (left_matmul_expr *) node; - /* initialize child's jacobian and precompute sparsity of its transpose */ + /* initialize child's jacobian and precompute sparsity of its CSC */ x->jacobian_init(x); - lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + lin_node->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); - /* precompute sparsity of this node's jacobian */ - node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work); + /* precompute sparsity of this node's jacobian in CSC and CSR */ + lin_node->J_CSC = block_left_multiply_fill_sparsity( + lin_node->A, lin_node->Jchild_CSC, lin_node->n_blocks); + node->jacobian = + csc_to_csr_fill_sparsity(lin_node->J_CSC, lin_node->csc_to_csr_workspace); } static void eval_jacobian(expr *node) { expr *x = node->left; - left_matmul_expr *lin_node = (left_matmul_expr *) node; + left_matmul_expr *lnode = (left_matmul_expr *) node; + + CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC; + CSC_Matrix *J_CSC = lnode->J_CSC; /* evaluate child's jacobian and convert to CSC */ x->eval_jacobian(x); - csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork); + csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->iwork); - /* compute this node's jacobian */ - csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian); + /* compute this node's jacobian: */ + block_left_multiply_fill_values(lnode->A, Jchild_CSC, J_CSC); + csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_workspace); } static void wsum_hess_init(expr *node) @@ -115,15 +130,17 @@ static void wsum_hess_init(expr *node) memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); /* work for computing A^T w*/ - int A_n = ((left_matmul_expr *) node)->A->n; - node->dwork = (double *) malloc(A_n * sizeof(double)); + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + int dim = ((left_matmul_expr *) node)->A->n * n_blocks; + node->dwork = (double *) malloc(dim * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { /* compute A^T w*/ - left_matmul_expr *lin_node = (left_matmul_expr *) node; - csr_matvec_wo_offset(lin_node->AT, w, node->dwork); + CSR_Matrix *AT = ((left_matmul_expr *) node)->AT; + int n_blocks = ((left_matmul_expr *) node)->n_blocks; + block_left_multiply_vec(AT, w, node->dwork, n_blocks); node->left->eval_wsum_hess(node->left, node->dwork); memcpy(node->wsum_hess->x, node->left->wsum_hess->x, @@ -132,10 +149,10 @@ static void eval_wsum_hess(expr *node, const double *w) expr *new_left_matmul(expr *u, const CSR_Matrix *A) { - /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do - A @ u where u is (n, ) which in C is actually (1, n). In that case the result - of A @ u is (m, ), which is (1, m) according to broadcasting rules. We - therefore check if this is the case. */ + /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users + to do A @ u where u is (n, ) which in C is actually (1, n). In that case + the result of A @ u is (m, ), which is (1, m) according to broadcasting + rules. We therefore check if this is the case. */ int d1, d2, n_blocks; if (u->d1 == A->n) { @@ -164,12 +181,17 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A) node->left = u; expr_retain(u); - /* Initialize type-specific fields */ - lin_node->A = block_diag_repeat_csr(A, n_blocks); - int alloc = MAX(lin_node->A->n, node->n_vars); - node->iwork = (int *) malloc(alloc * sizeof(int)); + /* allocate workspace. iwork is used for transposing A (requiring size A->n) + and for converting J_child csr to csc (requring size node->n_vars). + csc_to_csr_workspace is used for converting J_CSC to CSR (requring node->size) + */ + node->iwork = (int *) malloc(MAX(A->n, node->n_vars) * sizeof(int)); + lin_node->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int)); + lin_node->n_blocks = n_blocks; + + /* store A and AT */ + lin_node->A = new_csr(A); lin_node->AT = transpose(lin_node->A, node->iwork); - lin_node->CSC_work = NULL; return node; } diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c index 98c08d0..81cb317 100644 --- a/src/bivariate/right_matmul.c +++ b/src/bivariate/right_matmul.c @@ -17,6 +17,7 @@ */ #include "bivariate.h" #include "subexpr.h" +#include "utils/linalg.h" #include /* This file implements the atom 'right_matmul' corresponding to the operation y = diff --git a/src/problem.c b/src/problem.c index d9c14a9..aae28bc 100644 --- a/src/problem.c +++ b/src/problem.c @@ -246,7 +246,7 @@ static inline void print_end_message(const Diff_engine_stats *stats) { printf("\n" "============================================================\n" - " DNLP Differentiation Engine v%s\n" + " SparseDifferentiation v%s\n" " (c) D. Cederberg and W. Zhang, Stanford University, 2026\n" "============================================================\n", DIFF_ENGINE_VERSION); diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 2943599..d017a62 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -17,6 +17,7 @@ */ #include "utils/CSC_Matrix.h" #include "utils/iVec.h" +#include #include #include @@ -139,35 +140,6 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz, return sum; } -/* Unweighted sparse dot product of two sorted index arrays */ -static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, - const double *b_x, const int *b_i, int b_nnz) -{ - int ii = 0; - int jj = 0; - double sum = 0.0; - - while (ii < a_nnz && jj < b_nnz) - { - if (a_i[ii] == b_i[jj]) - { - sum += a_x[ii] * b_x[jj]; - ii++; - jj++; - } - else if (a_i[ii] < b_i[jj]) - { - ii++; - } - else - { - jj++; - } - } - - return sum; -} - void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) { int j, ii, jj; @@ -306,6 +278,72 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) } } +CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork) +{ + CSR_Matrix *C = new_csr_matrix(A->m, A->n, A->nnz); + + int i, j; + int *count = iwork; + memset(count, 0, A->m * sizeof(int)); + + // ------------------------------------------------------------------- + // compute nnz in each row of A, store in count + // ------------------------------------------------------------------- + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + assert(A->i[j] < A->m); + count[A->i[j]]++; + } + } + + // ------------------------------------------------------------------ + // compute row pointers + // ------------------------------------------------------------------ + C->p[0] = 0; + for (i = 0; i < A->m; ++i) + { + C->p[i + 1] = C->p[i] + count[i]; + count[i] = C->p[i]; + } + + // ------------------------------------------------------------------ + // fill column indices + // ------------------------------------------------------------------ + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + assert(A->i[j] < A->m); + C->i[count[A->i[j]]] = i; + count[A->i[j]]++; + } + } + + return C; +} + +void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) +{ + int i, j; + int *count = iwork; + memcpy(count, C->p, A->m * sizeof(int)); + + // ------------------------------------------------------------------ + // fill values + // ------------------------------------------------------------------ + for (i = 0; i < A->n; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + assert(A->i[j] < A->m); + C->x[count[A->i[j]]] = A->x[j]; + count[A->i[j]]++; + } + } +} + CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) { /* A is m x n, B is m x p, C = B^T A is p x n */ @@ -413,88 +451,3 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, } } } - -static bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len) -{ - int ai = 0, bi = 0; - while (ai < a_len && bi < b_len) - { - if (a_idx[ai] == b_idx[bi]) return true; - if (a_idx[ai] < b_idx[bi]) - { - ai++; - } - else - { - bi++; - } - } - return false; -} - -/* Fill values of C = A @ B where A is CSR, B is CSC. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C) -{ - for (int i = 0; i < A->m; i++) - { - for (int jj = C->p[i]; jj < C->p[i + 1]; jj++) - { - int j = C->i[jj]; - - int a_nnz = A->p[i + 1] - A->p[i]; - int b_nnz = B->p[j + 1] - B->p[j]; - - /* Compute dot product of row i of A and column j of B */ - double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz, - B->x + B->p[j], B->i + B->p[j], b_nnz); - - C->x[jj] = sum; - } - } -} - -/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) - with precomputed sparsity pattern */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) -{ - int m = A->m; - int p = B->n; - - int len_a, len_b; - - int *Cp = (int *) malloc((m + 1) * sizeof(int)); - iVec *Ci = iVec_new(m); - - Cp[0] = 0; - - // -------------------------------------------------------------- - // countz nnz and fill column indices - // -------------------------------------------------------------- - int nnz = 0; - for (int i = 0; i < A->m; i++) - { - len_a = A->p[i + 1] - A->p[i]; - - for (int j = 0; j < B->n; j++) - { - len_b = B->p[j + 1] - B->p[j]; - - if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b)) - { - iVec_append(Ci, j); - nnz++; - } - } - - Cp[i + 1] = nnz; - } - - CSR_Matrix *C = new_csr_matrix(m, p, nnz); - memcpy(C->p, Cp, (m + 1) * sizeof(int)); - memcpy(C->i, Ci->data, nnz * sizeof(int)); - free(Cp); - iVec_free(Ci); - - return C; -} diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_Matrix.c index 8b0635b..fe6688a 100644 --- a/src/utils/CSR_Matrix.c +++ b/src/utils/CSR_Matrix.c @@ -36,6 +36,15 @@ CSR_Matrix *new_csr_matrix(int m, int n, int nnz) return matrix; } +CSR_Matrix *new_csr(const CSR_Matrix *A) +{ + CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); + memcpy(copy->i, A->i, A->nnz * sizeof(int)); + memcpy(copy->x, A->x, A->nnz * sizeof(double)); + return copy; +} + void free_csr_matrix(CSR_Matrix *matrix) { if (matrix) diff --git a/src/utils/linalg.c b/src/utils/linalg.c new file mode 100644 index 0000000..82c03e0 --- /dev/null +++ b/src/utils/linalg.c @@ -0,0 +1,350 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" +#include "utils/iVec.h" +#include +#include +#include +#include +#include + +static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx, + int b_len, int b_offset) +{ + int ai = 0, bi = 0; + while (ai < a_len && bi < b_len) + { + if (a_idx[ai] == b_idx[bi] - b_offset) return true; + if (a_idx[ai] < b_idx[bi] - b_offset) + { + ai++; + } + else + { + bi++; + } + } + return false; +} + +/* Unweighted sparse dot product of two sorted index arrays */ +static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, + const double *b_x, const int *b_i, int b_nnz, + int b_offset) +{ + int ii = 0; + int jj = 0; + double sum = 0.0; + + while (ii < a_nnz && jj < b_nnz) + { + if (a_i[ii] == b_i[jj] - b_offset) + { + sum += a_x[ii] * b_x[jj]; + ii++; + jj++; + } + else if (a_i[ii] < b_i[jj] - b_offset) + { + ii++; + } + else + { + jj++; + } + } + + return sum; +} + +static inline double sparse_dot_offset(const double *a_x, const int *a_idx, + int a_nnz, const double *b_x, + const int *b_idx, int b_nnz, int b_offset) +{ + int ii = 0, jj = 0; + double sum = 0.0; + + while (ii < a_nnz && jj < b_nnz) + { + int b_col = b_idx[jj] - b_offset; + + if (a_idx[ii] == b_col) + { + sum += a_x[ii] * b_x[jj]; + ii++; + jj++; + } + else if (a_idx[ii] < b_col) + { + ii++; + } + else + { + jj++; + } + } + + return sum; +} + +CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, + const CSC_Matrix *J, int p) +{ + /* A is m x n, J is (n*p) x k, C is (m*p) x k */ + int m = A->m; + int n = A->n; + int k = J->n; + int j, jj, block, block_start, block_end, block_jj_start, block_jj_end, + row_offset; + + /* allocate column pointers and an estimate of row indices */ + int *Cp = (int *) malloc((k + 1) * sizeof(int)); + iVec *Ci = iVec_new(k * m); + Cp[0] = 0; + + /* for each column of j */ + for (j = 0; j < J->n; j++) + { + /* if empty we continue */ + if (J->p[j] == J->p[j + 1]) + { + Cp[j + 1] = Cp[j]; + continue; + } + + /* process each of p blocks of rows in this column of J */ + jj = J->p[j]; + for (block = 0; block < p; block++) + { + + // ----------------------------------------------------------------- + // find start and end indices of rows of J in this block + // ----------------------------------------------------------------- + block_start = block * n; + block_end = block_start + n; + while (jj < J->p[j + 1] && J->i[jj] < block_start) + { + jj++; + } + + block_jj_start = jj; + + while (jj < J->p[j + 1] && J->i[jj] < block_end) + { + jj++; + } + + block_jj_end = jj; + int nnz_in_block = block_jj_end - block_jj_start; + if (nnz_in_block == 0) + { + continue; + } + + // ----------------------------------------------------------------- + // check which rows of A overlap with the column indices of J in this + // block + // ----------------------------------------------------------------- + row_offset = block * m; + for (int i = 0; i < A->m; i++) + { + int a_len = A->p[i + 1] - A->p[i]; + if (has_overlap(A->i + A->p[i], a_len, J->i + block_jj_start, + nnz_in_block, block_start)) + { + iVec_append(Ci, row_offset + i); + } + } + } + Cp[j + 1] = Ci->len; + } + + /* Allocate result matrix C in CSC format */ + CSC_Matrix *C = new_csc_matrix(m * p, k, Ci->len); + + /* Copy column pointers and row indices */ + memcpy(C->p, Cp, (k + 1) * sizeof(int)); + memcpy(C->i, Ci->data, Ci->len * sizeof(int)); + + /* Clean up workspace */ + free(Cp); + iVec_free(Ci); + + return C; +} + +void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, + CSC_Matrix *C) +{ + /* A is m x n, J is (n*p) x k, C is (m*p) x k */ + int m = A->m; + int n = A->n; + int k = J->n; + + int i, j, row_a, block, block_start, block_end, start, end; + + /* to get rid of unitialized warnings */ + block = 0; + block_start = 0; + block_end = 0; + start = 0; + end = 0; + + /* for each column of J (and C) */ + for (j = 0; j < k; j++) + { + int previous_block = -1; + + for (i = C->p[j]; i < C->p[j + 1]; i++) + { + /* choose row of A and block of column of J */ + row_a = C->i[i] % m; + block = C->i[i] / m; + + // ------------------------------------------------------------------------- + // find the part of the column of J in the current block + // ------------------------------------------------------------------------- + if (block != previous_block) + { + previous_block = block; + block_start = block * n; + block_end = block_start + n; + start = J->p[j]; + end = J->p[j + 1]; + + while (start < J->p[j + 1] && J->i[start] < block_start) + { + start++; + } + + while (end > start && J->i[end - 1] >= block_end) + { + end--; + } + } + + // ------------------------------------------------------------------------------ + // compute value as sparse dot product of row of A and column of J in + // this block + // ------------------------------------------------------------------------------ + int a_len = A->p[row_a + 1] - A->p[row_a]; + C->x[i] = + sparse_dot(A->x + A->p[row_a], A->i + A->p[row_a], a_len, + J->x + start, J->i + start, end - start, block_start); + } + } +} + +/* Fill values of C = A @ B where A is CSR, B is CSC. */ +void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, + CSR_Matrix *C) +{ + for (int i = 0; i < A->m; i++) + { + for (int jj = C->p[i]; jj < C->p[i + 1]; jj++) + { + int j = C->i[jj]; + + int a_nnz = A->p[i + 1] - A->p[i]; + int b_nnz = B->p[j + 1] - B->p[j]; + + /* Compute dot product of row i of A and column j of B */ + double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz, + B->x + B->p[j], B->i + B->p[j], b_nnz, 0); + + C->x[jj] = sum; + } + } +} + +/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) + with precomputed sparsity pattern */ +CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) +{ + int m = A->m; + int p = B->n; + + int len_a, len_b; + + int *Cp = (int *) malloc((m + 1) * sizeof(int)); + iVec *Ci = iVec_new(m); + + Cp[0] = 0; + + // -------------------------------------------------------------- + // count nnz and fill column indices + // -------------------------------------------------------------- + int nnz = 0; + for (int i = 0; i < A->m; i++) + { + len_a = A->p[i + 1] - A->p[i]; + + for (int j = 0; j < B->n; j++) + { + len_b = B->p[j + 1] - B->p[j]; + + if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b, 0)) + { + iVec_append(Ci, j); + nnz++; + } + } + + Cp[i + 1] = nnz; + } + + CSR_Matrix *C = new_csr_matrix(m, p, nnz); + memcpy(C->p, Cp, (m + 1) * sizeof(int)); + memcpy(C->i, Ci->data, nnz * sizeof(int)); + free(Cp); + iVec_free(Ci); + + return C; +} + +/* Compute block-wise matrix-vector products. + * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector. + * x is split into p blocks of n elements each. + */ +void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, + int p) +{ + /* For each block */ + for (int block = 0; block < p; block++) + { + int block_start = block * A->n; + int y_offset = block * A->m; + + /* For each row of A */ + for (int i = 0; i < A->m; i++) + { + double row_sum = 0.0; + int row_nnz = A->p[i + 1] - A->p[i]; + + /* Compute sparse dot product of A[i,:] with x_block */ + for (int idx = 0; idx < row_nnz; idx++) + { + int col = A->i[A->p[i] + idx]; + row_sum += A->x[A->p[i] + idx] * x[block_start + col]; + } + + y[y_offset + i] = row_sum; + } + } +} diff --git a/tests/all_tests.c b/tests/all_tests.c index 4e7571a..3b23e45 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -3,6 +3,7 @@ #include "minunit.h" /* Include all test headers */ +#ifndef PROFILE_ONLY #include "forward_pass/affine/test_add.h" #include "forward_pass/affine/test_broadcast.h" #include "forward_pass/affine/test_hstack.h" @@ -43,7 +44,9 @@ #include "jacobian_tests/test_transpose.h" #include "problem/test_problem.h" #include "utils/test_csc_matrix.h" +#include "utils/test_csr_csc_conversion.h" #include "utils/test_csr_matrix.h" +#include "utils/test_linalg.h" #include "wsum_hess/elementwise/test_entr.h" #include "wsum_hess/elementwise/test_exp.h" #include "wsum_hess/elementwise/test_hyperbolic.h" @@ -72,6 +75,11 @@ #include "wsum_hess/test_sum.h" #include "wsum_hess/test_trace.h" #include "wsum_hess/test_transpose.h" +#endif /* PROFILE_ONLY */ + +#ifdef PROFILE_ONLY +#include "profiling/profile_left_matmul.h" +#endif /* PROFILE_ONLY */ int main(void) { @@ -79,6 +87,7 @@ int main(void) int tests_run = 0; +#ifndef PROFILE_ONLY printf("--- Forward Pass Tests ---\n"); mu_run_test(test_variable, tests_run); mu_run_test(test_constant, tests_run); @@ -153,6 +162,7 @@ int main(void) mu_run_test(test_broadcast_row_jacobian, tests_run); mu_run_test(test_broadcast_col_jacobian, tests_run); mu_run_test(test_broadcast_scalar_to_matrix_jacobian, tests_run); + mu_run_test(test_double_broadcast, tests_run); mu_run_test(test_wsum_hess_multiply_1, tests_run); mu_run_test(test_wsum_hess_multiply_2, tests_run); mu_run_test(test_jacobian_trace_variable, tests_run); @@ -239,6 +249,19 @@ int main(void) mu_run_test(test_kron_identity_csr, tests_run); mu_run_test(test_csr_to_csc1, tests_run); mu_run_test(test_csr_to_csc2, tests_run); + mu_run_test(test_csr_to_csc_split, tests_run); + mu_run_test(test_csc_to_csr_sparsity, tests_run); + mu_run_test(test_csc_to_csr_values, tests_run); + mu_run_test(test_csr_csc_csr_roundtrip, tests_run); + mu_run_test(test_block_left_multiply_single_block, tests_run); + mu_run_test(test_block_left_multiply_two_blocks, tests_run); + mu_run_test(test_block_left_multiply_zero_column, tests_run); + mu_run_test(test_csr_csc_matmul_alloc_basic, tests_run); + mu_run_test(test_csr_csc_matmul_alloc_sparse, tests_run); + mu_run_test(test_block_left_multiply_vec_single_block, tests_run); + mu_run_test(test_block_left_multiply_vec_two_blocks, tests_run); + mu_run_test(test_block_left_multiply_vec_sparse, tests_run); + mu_run_test(test_block_left_multiply_vec_three_blocks, tests_run); mu_run_test(test_csr_vecmat_values_sparse, tests_run); mu_run_test(test_sum_all_rows_csr, tests_run); mu_run_test(test_sum_block_of_rows_csr, tests_run); @@ -257,6 +280,12 @@ int main(void) mu_run_test(test_problem_jacobian_multi, tests_run); mu_run_test(test_problem_constraint_forward, tests_run); mu_run_test(test_problem_hessian, tests_run); +#endif /* PROFILE_ONLY */ + +#ifdef PROFILE_ONLY + printf("\n--- Profiling Tests ---\n"); + mu_run_test(profile_left_matmul, tests_run); +#endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/jacobian_tests/test_broadcast.h b/tests/jacobian_tests/test_broadcast.h index 43872fe..612e5cf 100644 --- a/tests/jacobian_tests/test_broadcast.h +++ b/tests/jacobian_tests/test_broadcast.h @@ -133,3 +133,35 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) free_expr(bcast); return 0; } + +const char *test_double_broadcast(void) +{ + double x_vals[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + double b_vals[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; + + /* form the expression x + b */ + expr *x = new_variable(5, 1, 0, 5); + expr *b = new_constant(1, 5, 5, b_vals); + expr *bcast_x = new_broadcast(x, 5, 5); + expr *bcast_b = new_broadcast(b, 5, 5); + expr *sum = new_add(bcast_x, bcast_b); + + sum->forward(sum, x_vals); + sum->jacobian_init(sum); + sum->eval_jacobian(sum); + + /* All 6 elements depend on the single input variable */ + // double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + // int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; + // int expected_i[6] = {0, 0, 0, 0, 0, 0}; + // + // mu_assert("broadcast scalar jacobian vals fail", + // cmp_double_array(sum->jacobian->x, expected_x, 6)); + // mu_assert("broadcast scalar jacobian rows fail", + // cmp_int_array(sum ->jacobian->p, expected_p, 7)); + // mu_assert("broadcast scalar jacobian cols fail", + // cmp_int_array(bcast->jacobian->i, expected_i, 6)); + + free_expr(sum); + return 0; +} diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h new file mode 100644 index 0000000..a8e819b --- /dev/null +++ b/tests/profiling/profile_left_matmul.h @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" +#include "utils/Timer.h" + +const char *profile_left_matmul() +{ + /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ + int n = 100; + expr *X = new_variable(n, n, 0, n * n); + CSR_Matrix *A = new_csr_matrix(n, n, n * n); + for (int i = 0; i < n * n; i++) + { + A->x[i] = 1.0; /* dense matrix of all ones */ + } + for (int row = 0; row < n; row++) + { + A->p[row] = row * n; + for (int col = 0; col < n; col++) + { + A->i[row * n + col] = col; + } + } + A->p[n] = n * n; + + expr *AX = new_left_matmul(X, A); + + double *x_vals = (double *) malloc(n * n * sizeof(double)); + for (int i = 0; i < n * n; i++) + { + x_vals[i] = 1.0; + } + + Timer timer; + clock_gettime(CLOCK_MONOTONIC, &timer.start); + AX->forward(AX, x_vals); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul forward time: %8.3f seconds\n", GET_ELAPSED_SECONDS(timer)); + clock_gettime(CLOCK_MONOTONIC, &timer.start); + AX->jacobian_init(AX); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul jacobian init time: %8.3f seconds\n", + GET_ELAPSED_SECONDS(timer)); + clock_gettime(CLOCK_MONOTONIC, &timer.start); + AX->eval_jacobian(AX); + clock_gettime(CLOCK_MONOTONIC, &timer.end); + printf("left_matmul jacobian eval time: %8.3f seconds\n", + GET_ELAPSED_SECONDS(timer)); + + free(x_vals); + free_csr_matrix(A); + free_expr(AX); + return 0; +} diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h new file mode 100644 index 0000000..c7daeb6 --- /dev/null +++ b/tests/utils/test_csr_csc_conversion.h @@ -0,0 +1,171 @@ +#include +#include +#include +#include + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" + +/* Test CSR to CSC conversion with fill_sparsity and fill_values */ +const char *test_csr_to_csc_split() +{ + /* Create a 4x5 CSR matrix A: + * [1.0 0.0 0.0 0.0 1.0] + * [0.0 0.0 3.0 0.0 0.0] + * [0.0 2.0 0.0 0.0 0.0] + * [0.0 0.0 0.0 4.0 0.0] + */ + CSR_Matrix *A = new_csr_matrix(4, 5, 5); + double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; + int Ai[5] = {0, 4, 2, 1, 3}; + int Ap[5] = {0, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 5 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->n * sizeof(int)); + + /* First, fill sparsity pattern */ + CSC_Matrix *C = csr_to_csc_fill_sparsity(A, iwork); + + /* Check sparsity pattern */ + int Cp_correct[6] = {0, 1, 2, 3, 4, 5}; + int Ci_correct[5] = {0, 2, 1, 3, 0}; + + mu_assert("C col pointers incorrect", cmp_int_array(C->p, Cp_correct, 6)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, Ci_correct, 5)); + + /* Now fill values */ + csr_to_csc_fill_values(A, C, iwork); + + /* Check values */ + double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 1.0}; + + mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); + + free(iwork); + free_csr_matrix(A); + free_csc_matrix(C); + + return 0; +} + +/* Test CSC to CSR conversion with fill_sparsity */ +const char *test_csc_to_csr_sparsity() +{ + /* Create a 4x5 CSC matrix A: + * [1.0 0.0 0.0 0.0 2.0] + * [0.0 0.0 3.0 0.0 0.0] + * [0.0 4.0 0.0 0.0 0.0] + * [0.0 0.0 0.0 5.0 0.0] + */ + CSC_Matrix *A = new_csc_matrix(4, 5, 5); + double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; + int Ai[5] = {0, 2, 1, 3, 0}; + int Ap[6] = {0, 1, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 6 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->m * sizeof(int)); + + /* Fill sparsity pattern */ + CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork); + + /* Expected CSR format: + * Row 0: [1.0 at col 0, 2.0 at col 4] + * Row 1: [3.0 at col 2] + * Row 2: [4.0 at col 1] + * Row 3: [5.0 at col 3] + */ + int Cp_correct[5] = {0, 2, 3, 4, 5}; + int Ci_correct[5] = {0, 4, 2, 1, 3}; + + mu_assert("C row pointers incorrect", cmp_int_array(C->p, Cp_correct, 5)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, Ci_correct, 5)); + mu_assert("C dimensions incorrect", C->m == 4 && C->n == 5); + mu_assert("C nnz incorrect", C->nnz == 5); + + free(iwork); + free_csc_matrix(A); + free_csr_matrix(C); + + return 0; +} + +/* Test CSC to CSR conversion with fill_values */ +const char *test_csc_to_csr_values() +{ + /* Create a 4x5 CSC matrix A */ + CSC_Matrix *A = new_csc_matrix(4, 5, 5); + double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; + int Ai[5] = {0, 2, 1, 3, 0}; + int Ap[6] = {0, 1, 2, 3, 4, 5}; + memcpy(A->x, Ax, 5 * sizeof(double)); + memcpy(A->i, Ai, 5 * sizeof(int)); + memcpy(A->p, Ap, 6 * sizeof(int)); + + /* Allocate workspace */ + int *iwork = (int *) malloc(A->m * sizeof(int)); + + /* Fill sparsity pattern */ + CSR_Matrix *C = csc_to_csr_fill_sparsity(A, iwork); + + /* Fill values */ + csc_to_csr_fill_values(A, C, iwork); + + /* Check values */ + double Cx_correct[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + + mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); + + free(iwork); + free_csc_matrix(A); + free_csr_matrix(C); + + return 0; +} + +/* Test round-trip conversion: CSR -> CSC -> CSR */ +const char *test_csr_csc_csr_roundtrip() +{ + /* Create a 3x4 CSR matrix A: + * [1.0 2.0 0.0 3.0] + * [0.0 4.0 5.0 0.0] + * [6.0 0.0 7.0 8.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 4, 8); + double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3}; + int Ap[4] = {0, 3, 5, 8}; + memcpy(A->x, Ax, 8 * sizeof(double)); + memcpy(A->i, Ai, 8 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* Convert CSR to CSC */ + int *iwork_csc = (int *) malloc(A->n * sizeof(int)); + CSC_Matrix *B = csr_to_csc_fill_sparsity(A, iwork_csc); + csr_to_csc_fill_values(A, B, iwork_csc); + + /* Convert CSC back to CSR */ + int *iwork_csr = (int *) malloc(B->m * sizeof(int)); + CSR_Matrix *C = csc_to_csr_fill_sparsity(B, iwork_csr); + csc_to_csr_fill_values(B, C, iwork_csr); + + /* C should match A */ + mu_assert("Round-trip: vals incorrect", cmp_double_array(C->x, Ax, 8)); + mu_assert("Round-trip: col indices incorrect", cmp_int_array(C->i, Ai, 8)); + mu_assert("Round-trip: row pointers incorrect", cmp_int_array(C->p, Ap, 4)); + + free(iwork_csc); + free(iwork_csr); + free_csr_matrix(A); + free_csc_matrix(B); + free_csr_matrix(C); + + return 0; +} diff --git a/tests/utils/test_linalg.h b/tests/utils/test_linalg.h new file mode 100644 index 0000000..e7a6896 --- /dev/null +++ b/tests/utils/test_linalg.h @@ -0,0 +1,398 @@ +#include +#include +#include +#include + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" +#include "utils/linalg.h" + +/* Test block_left_multiply_fill_sparsity with simple case: single block */ +const char *test_block_left_multiply_single_block() +{ + /* A is 2x3 CSR: + * [1.0 0.0 0.0] + * [0.0 1.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 3); + double Ax[3] = {1.0, 1.0, 1.0}; + int Ai[3] = {0, 1, 2}; + int Ap[3] = {0, 1, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* J is 3x2 CSC (single block, so p=1): + * [1.0 0.0] + * [1.0 0.0] + * [0.0 1.0] + */ + CSC_Matrix *J = new_csc_matrix(3, 2, 3); + double Jx[3] = {1.0, 1.0, 1.0}; + int Ji[3] = {0, 1, 2}; + int Jp[3] = {0, 2, 3}; + memcpy(J->x, Jx, 3 * sizeof(double)); + memcpy(J->i, Ji, 3 * sizeof(int)); + memcpy(J->p, Jp, 3 * sizeof(int)); + + /* Compute C = A @ J1 (p=1 means just one block) */ + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + + /* Expected C is 2x2: + * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has + * row 0) C[1,0] = A[1,:] @ J[:,0] = 1.0*1.0 + 1.0*0.0 = 1.0 (row 1 has columns + * 1,2, J col 0 has row 1) C[0,1] = A[0,:] @ J[:,1] = 0.0 (row 0 has column 0, J + * col 1 has row 2) C[1,1] = A[1,:] @ J[:,1] = 1.0*1.0 = 1.0 (row 1 has columns + * 1,2, J col 1 has row 2) + */ + int expected_p[3] = {0, 2, 3}; + int expected_i[3] = {0, 1, 1}; + + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 3); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p, 3)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i, 3)); + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test block_left_multiply_fill_sparsity with two blocks */ +const char *test_block_left_multiply_two_blocks() +{ + /* A is 2x2 CSR: + * [1.0 0.0] + * [0.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 2, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 1}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* J is 4x3 CSC (two blocks of 2 rows each): + * Block 1 rows [0,1]: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * Block 2 rows [2,3]: + * [0.0 1.0 0.0] + * [0.0 0.0 1.0] + * So J is: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 1.0 0.0] + * [0.0 0.0 1.0] + */ + CSC_Matrix *J = new_csc_matrix(4, 3, 3); + double Jx[3] = {1.0, 1.0, 1.0}; + int Ji[3] = {0, 2, 3}; + int Jp[4] = {0, 1, 2, 3}; + memcpy(J->x, Jx, 3 * sizeof(double)); + memcpy(J->i, Ji, 3 * sizeof(int)); + memcpy(J->p, Jp, 4 * sizeof(int)); + + /* Compute C = [A @ J1; A @ J2] where: + * J1 = [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + * J2 = [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + * + * C = [A @ J1; A @ J2] is 4x3: + * A @ J1 = [[1, 0, 0], [0, 0, 0]] (row 0 of A matches col 0 of J1) + * A @ J2 = [[0, 0, 0], [0, 1, 1]] (row 1 of A matches cols 1,2 of J2) + * So C is: + * [1.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 0.0 0.0] + * [0.0 1.0 1.0] + */ + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2); + block_left_multiply_fill_values(A, J, C); + + int expected_p2[4] = {0, 1, 2, 3}; + int expected_i2[3] = {0, 2, 3}; + double expected_x2[3] = {1.0, 1.0, 1.0}; + + mu_assert("C dims incorrect", C->m == 4 && C->n == 3 && C->nnz == 3); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p2, 4)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i2, 3)); + mu_assert("C values incorrect", cmp_double_array(C->x, expected_x2, 3)); + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test block_left_multiply_fill_sparsity with all zero column in J */ +const char *test_block_left_multiply_zero_column() +{ + /* A is 2x2 CSR (identity) */ + CSR_Matrix *A = new_csr_matrix(2, 2, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 1}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* J is 2x2 with an empty column: + * [1.0 0.0] + * [0.0 0.0] + */ + CSC_Matrix *J = new_csc_matrix(2, 2, 1); + double Jx[1] = {1.0}; + int Ji[1] = {0}; + int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ + memcpy(J->x, Jx, 1 * sizeof(double)); + memcpy(J->i, Ji, 1 * sizeof(int)); + memcpy(J->p, Jp, 3 * sizeof(int)); + + CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + + int expected_p3[3] = {0, 1, 1}; + int expected_i3[1] = {0}; + + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 1); + mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p3, 3)); + mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i3, 1)); + + free_csc_matrix(C); + free_csr_matrix(A); + free_csc_matrix(J); + return NULL; +} + +/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */ +const char *test_csr_csc_matmul_alloc_basic() +{ + /* A is 3x2 CSR: + * [1.0 0.0] + * [0.0 1.0] + * [1.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 2, 4); + double Ax[4] = {1.0, 1.0, 1.0, 1.0}; + int Ai[4] = {0, 1, 0, 1}; + int Ap[4] = {0, 1, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* B is 2x3 CSC: + * [1.0 0.0 1.0] + * [0.0 1.0 1.0] + */ + CSC_Matrix *B = new_csc_matrix(2, 3, 4); + double Bx[4] = {1.0, 1.0, 1.0, 1.0}; + int Bi[4] = {0, 1, 0, 1}; + int Bp[4] = {0, 1, 2, 4}; + memcpy(B->x, Bx, 4 * sizeof(double)); + memcpy(B->i, Bi, 4 * sizeof(int)); + memcpy(B->p, Bp, 4 * sizeof(int)); + + /* C = A @ B is 3x3: + * C = [[1, 0, 1], + * [0, 1, 1], + * [1, 1, 2]] + */ + CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + + int expected_p4[4] = {0, 2, 4, 7}; + int expected_i4[7] = {0, 2, 1, 2, 0, 1, 2}; + + mu_assert("C dims incorrect", C->m == 3 && C->n == 3 && C->nnz == 7); + mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p4, 4)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i4, 7)); + + free_csr_matrix(C); + free_csr_matrix(A); + free_csc_matrix(B); + return NULL; +} + +/* Test csr_csc_matmul_alloc with sparse result */ +const char *test_csr_csc_matmul_alloc_sparse() +{ + /* A is 2x3 CSR: + * [1.0 0.0 0.0] + * [0.0 0.0 1.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 2); + double Ax[2] = {1.0, 1.0}; + int Ai[2] = {0, 2}; + int Ap[3] = {0, 1, 2}; + memcpy(A->x, Ax, 2 * sizeof(double)); + memcpy(A->i, Ai, 2 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* B is 3x2 CSC: + * [1.0 0.0] + * [0.0 0.0] + * [0.0 1.0] + */ + CSC_Matrix *B = new_csc_matrix(3, 2, 2); + double Bx[2] = {1.0, 1.0}; + int Bi[2] = {0, 2}; + int Bp[3] = {0, 1, 2}; + memcpy(B->x, Bx, 2 * sizeof(double)); + memcpy(B->i, Bi, 2 * sizeof(int)); + memcpy(B->p, Bp, 3 * sizeof(int)); + + /* C = A @ B is 2x2: + * C = [[1, 0], + * [0, 1]] + */ + CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + + int expected_p5[3] = {0, 1, 2}; + int expected_i5[2] = {0, 1}; + + mu_assert("C dims incorrect", C->m == 2 && C->n == 2 && C->nnz == 2); + mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p5, 3)); + mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i5, 2)); + + free_csr_matrix(C); + free_csr_matrix(A); + free_csc_matrix(B); + return NULL; +} + +/* Test block_left_multiply_vec with single block: y = A @ x */ +const char *test_block_left_multiply_vec_single_block() +{ + /* A is 2x3 CSR: + * [1.0 0.0 2.0] + * [0.0 3.0 0.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 3); + double Ax[3] = {1.0, 3.0, 2.0}; + int Ai[3] = {0, 1, 2}; + int Ap[3] = {0, 2, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (3*1)-length vector = [1.0, 2.0, 3.0] */ + double x[3] = {1.0, 2.0, 3.0}; + double y[2] = {0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 1); + + /* Expected: y = [1.0*1.0 + 0.0*2.0 + 2.0*3.0, 0.0*1.0 + 3.0*2.0 + 0.0*3.0] + * = [1.0 + 6.0, 6.0] + * = [7.0, 6.0] + */ + double expected_y[2] = {7.0, 6.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 2)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */ +const char *test_block_left_multiply_vec_two_blocks() +{ + /* A is 2x3 CSR: + * [1.0 2.0 0.0] + * [0.0 3.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 4); + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + int Ai[4] = {0, 1, 1, 2}; + int Ap[3] = {0, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (3*2)-length vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + * x1 = [1.0, 2.0, 3.0], x2 = [4.0, 5.0, 6.0] + */ + double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double y[4] = {0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 2); + + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*2.0 + 4.0*3.0] = + * [5.0, 18.0] Expected block 2: y[2:4] = A @ x2 = [1.0*4.0 + 2.0*5.0, 3.0*5.0 + * + 4.0*6.0] = [14.0, 39.0] + */ + double expected_y[4] = {5.0, 18.0, 14.0, 39.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with sparse matrix and multiple blocks */ +const char *test_block_left_multiply_vec_sparse() +{ + /* A is 3x4 CSR (very sparse): + * [2.0 0.0 0.0 0.0] + * [0.0 0.0 3.0 0.0] + * [0.0 0.0 0.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(3, 4, 3); + double Ax[3] = {2.0, 3.0, 4.0}; + int Ai[3] = {0, 2, 3}; + int Ap[4] = {0, 1, 2, 3}; + memcpy(A->x, Ax, 3 * sizeof(double)); + memcpy(A->i, Ai, 3 * sizeof(int)); + memcpy(A->p, Ap, 4 * sizeof(int)); + + /* x is (4*2)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] + * x1 = [1.0, 2.0, 3.0, 4.0], x2 = [5.0, 6.0, 7.0, 8.0] + */ + double x[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 2); + + /* Expected block 1: y[0:3] = A @ x1 = [2.0*1.0, 3.0*3.0, 4.0*4.0] = + * [2.0, 9.0, 16.0] Expected block 2: y[3:6] = A @ x2 = + * [2.0*5.0, 3.0*7.0, 4.0*8.0] = [10.0, 21.0, 32.0] + */ + double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); + + free_csr_matrix(A); + return NULL; +} + +/* Test block_left_multiply_vec with three blocks */ +const char *test_block_left_multiply_vec_three_blocks() +{ + /* A is 2x2 CSR: + * [1.0 2.0] + * [3.0 4.0] + */ + CSR_Matrix *A = new_csr_matrix(2, 2, 4); + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + int Ai[4] = {0, 1, 0, 1}; + int Ap[3] = {0, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + /* x is (2*3)-length = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + * x1 = [1.0, 2.0], x2 = [3.0, 4.0], x3 = [5.0, 6.0] + */ + double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + block_left_multiply_vec(A, x, y, 3); + + /* Expected block 1: y[0:2] = A @ x1 = [1.0*1.0 + 2.0*2.0, 3.0*1.0 + 4.0*2.0] = + * [5.0, 11.0] Expected block 2: y[2:4] = A @ x2 = [1.0*3.0 + 2.0*4.0, 3.0*3.0 + * + 4.0*4.0] = [11.0, 25.0] Expected block 3: y[4:6] = A @ x3 = [1.0*5.0 + * + 2.0*6.0, 3.0*5.0 + 4.0*6.0] = [17.0, 39.0] + */ + double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0}; + mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); + + free_csr_matrix(A); + return NULL; +}