diff --git a/include/bivariate.h b/include/bivariate.h index 77623e1..4a3e674 100644 --- a/include/bivariate.h +++ b/include/bivariate.h @@ -10,4 +10,10 @@ expr *new_quad_over_lin(expr *left, expr *right); expr *new_rel_entr_first_arg_scalar(expr *left, expr *right); expr *new_rel_entr_second_arg_scalar(expr *left, expr *right); +/* Left matrix multiplication: A @ f(x) where A is a constant matrix */ +expr *new_left_matmul(expr *u, const CSR_Matrix *A); + +/* Right matrix multiplication: f(x) @ A where A is a constant matrix */ +expr *new_right_matmul(expr *u, const CSR_Matrix *A); + #endif /* BIVARIATE_H */ diff --git a/include/subexpr.h b/include/subexpr.h index 1b6d6a2..4368662 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -73,4 +73,26 @@ typedef struct elementwise_mult_expr CSR_Matrix *CSR_work2; } elementwise_mult_expr; +/* Left matrix multiplication: y = A * f(x) where f(x) is an expression. Note that +here A does not have global column indices but it is a local matrix. This is an +important distinction compared to linear_op_expr. */ +typedef struct left_matmul_expr +{ + expr base; + CSR_Matrix *A; + CSR_Matrix *AT; + CSC_Matrix *CSC_work; +} left_matmul_expr; + +/* Right matrix multiplication: y = f(x) * A where f(x) is an expression. + * f(x) has shape p x n, A has shape n x q, output y has shape p x q. + * Uses vec(y) = B * vec(f(x)) where B = A^T kron I_p. 
*/ +typedef struct right_matmul_expr +{ + expr base; + CSR_Matrix *B; /* B = A^T kron I_p */ + CSR_Matrix *BT; /* B^T for backpropagating Hessian weights */ + CSC_Matrix *CSC_work; +} right_matmul_expr; + #endif /* SUBEXPR_H */ diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_Matrix.h index 51e6f6a..ed044dc 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_Matrix.h @@ -49,4 +49,18 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, */ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C); +CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork); +void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); + +/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC + * Precomputes sparsity pattern. No workspace required. + */ +CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B); + +/* Fill values of C = A @ B where A is CSR, B is CSC + * C must have sparsity pattern already computed + */ +void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, + CSR_Matrix *C); + #endif /* CSC_MATRIX_H */ diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_Matrix.h index 6dbeb24..94a3b0d 100644 --- a/include/utils/CSR_Matrix.h +++ b/include/utils/CSR_Matrix.h @@ -34,9 +34,18 @@ void free_csr_matrix(CSR_Matrix *matrix); /* Copy CSR matrix A to C */ void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C); +/* Build block-diagonal repeat A_blk = I_p kron A. Returns newly allocated CSR + * matrix of size (p*A->m) x (p*A->n) with nnz = p*A->nnz. */ +CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p); + +/* Build left-repeated Kronecker A_kron = A kron I_p. Returns newly allocated CSR + * matrix of size (A->m * p) x (A->n * p) with nnz = A->nnz * p. */ +CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p); + /* matvec y = Ax, where A indices minus col_offset gives x indices. Returns y as * dense. 
*/ void csr_matvec(const CSR_Matrix *A, const double *x, double *y, int col_offset); +void csr_matvec_wo_offset(const CSR_Matrix *A, const double *x, double *y); /* C = z^T A is assumed to have one row. C must have column indices pre-computed and transposed matrix AT must be provided. Fills in values of C only. @@ -141,6 +150,7 @@ void insert_idx(int idx, int *arr, int len); double csr_get_value(const CSR_Matrix *A, int row, int col); +/* iwork must be of size A->n*/ CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork); CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork); diff --git a/src/affine/linear_op.c b/src/affine/linear_op.c index 4e9caa8..89a258a 100644 --- a/src/affine/linear_op.c +++ b/src/affine/linear_op.c @@ -1,4 +1,5 @@ #include "affine.h" +#include #include static void forward(expr *node, const double *u) @@ -9,7 +10,7 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A * x */ - csr_matvec(node->jacobian, x->value, node->value, x->var_id); + csr_matvec(((linear_op_expr *) node)->A_csr, x->value, node->value, x->var_id); } static bool is_affine(const expr *node) @@ -20,21 +21,31 @@ static bool is_affine(const expr *node) static void free_type_data(expr *node) { linear_op_expr *lin_node = (linear_op_expr *) node; - /* memory pointing to by A_csr will already be freed when the - jacobian is freed, so free_csr_matrix(lin_node->A_csr) should - be commented out */ - // free_csr_matrix(lin_node->A_csr); + /* memory pointing to by A_csr will be freed when the jacobian is freed, + so if the jacobian is not null we must not free A_csr. 
*/ + + if (!node->jacobian) + { + free_csr_matrix(lin_node->A_csr); + } + free_csc_matrix(lin_node->A_csc); lin_node->A_csr = NULL; lin_node->A_csc = NULL; } +static void jacobian_init(expr *node) +{ + node->jacobian = ((linear_op_expr *) node)->A_csr; +} + expr *new_linear(expr *u, const CSR_Matrix *A) { + assert(u->d2 == 1); /* Allocate the type-specific struct */ linear_op_expr *lin_node = (linear_op_expr *) calloc(1, sizeof(linear_op_expr)); expr *node = &lin_node->base; - init_expr(node, A->m, 1, u->n_vars, forward, NULL, NULL, is_affine, + init_expr(node, A->m, 1, u->n_vars, forward, jacobian_init, NULL, is_affine, free_type_data); node->left = u; expr_retain(u); @@ -44,8 +55,5 @@ expr *new_linear(expr *u, const CSR_Matrix *A) copy_csr_matrix(A, lin_node->A_csr); lin_node->A_csc = csr_to_csc(A); - /* what if we have A @ phi(x). Then I don't think this is correct. */ - node->jacobian = lin_node->A_csr; - return node; } diff --git a/src/bivariate/left_matmul.c b/src/bivariate/left_matmul.c new file mode 100644 index 0000000..517f48c --- /dev/null +++ b/src/bivariate/left_matmul.c @@ -0,0 +1,136 @@ +#include "bivariate.h" +#include "subexpr.h" +#include + +/* This file implement the atom 'left_matmul' corresponding to the operation y = + A @ f(x), where A is a given matrix and f(x) is an arbitrary expression. + Here, f(x) can be a vector-valued expression and a matrix-valued + expression. The dimensions are A - m x n, f(x) - n x p, y - m x p. + Note that here A does not have global column indices but it is a local matrix. + This is an important distinction compared to linear_op_expr. + + * To compute the forward pass: vec(y) = A_kron @ vec(f(x)), + where A_kron = I_p kron A is a Kronecker product of size (m*p) x (n*p), + or more specificely, a block-diagonal matrix with p blocks of A along the + diagonal. + + * To compute the Jacobian: J_y = A_kron @ J_f(x), where J_f(x) is the + Jacobian of f(x) of size (n*p) x n_vars. 
+ + * To compute the contribution to the Lagrange Hessian: we form + w = A_kron^T @ lambda and then evaluate the hessian of f(x). + + Working in terms of A_kron unifies the implementation of f(x) being + vector-valued or matrix-valued. + + +*/ + +// todo: put this in common somewhere +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static void forward(expr *node, const double *u) +{ + expr *x = node->left; + + /* child's forward pass */ + node->left->forward(node->left, u); + + /* y = A_kron @ vec(f(x)) */ + csr_matvec_wo_offset(((left_matmul_expr *) node)->A, x->value, node->value); +} + +static bool is_affine(const expr *node) +{ + return node->left->is_affine(node->left); +} + +static void free_type_data(expr *node) +{ + left_matmul_expr *lin_node = (left_matmul_expr *) node; + free_csr_matrix(lin_node->A); + free_csr_matrix(lin_node->AT); + if (lin_node->CSC_work) + { + free_csc_matrix(lin_node->CSC_work); + } + lin_node->A = NULL; + lin_node->AT = NULL; + lin_node->CSC_work = NULL; +} + +static void jacobian_init(expr *node) +{ + expr *x = node->left; + left_matmul_expr *lin_node = (left_matmul_expr *) node; + + /* initialize child's jacobian and precompute sparsity of its transpose */ + x->jacobian_init(x); + lin_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + + /* precompute sparsity of this node's jacobian */ + node->jacobian = csr_csc_matmul_alloc(lin_node->A, lin_node->CSC_work); +} + +static void eval_jacobian(expr *node) +{ + expr *x = node->left; + left_matmul_expr *lin_node = (left_matmul_expr *) node; + + /* evaluate child's jacobian and convert to CSC */ + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, lin_node->CSC_work, node->iwork); + + /* compute this node's jacobian */ + csr_csc_matmul_fill_values(lin_node->A, lin_node->CSC_work, node->jacobian); +} + +static void wsum_hess_init(expr *node) +{ + /* initialize child's hessian */ + expr *x = node->left; + x->wsum_hess_init(x); + + /* allocate this node's hessian 
with the same sparsity as child's */ + node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz); + memcpy(node->wsum_hess->p, x->wsum_hess->p, (node->n_vars + 1) * sizeof(int)); + memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); + + /* work for computing A^T w*/ + int A_n = ((left_matmul_expr *) node)->A->n; + node->dwork = (double *) malloc(A_n * sizeof(double)); +} + +static void eval_wsum_hess(expr *node, const double *w) +{ + /* compute A^T w*/ + left_matmul_expr *lin_node = (left_matmul_expr *) node; + csr_matvec_wo_offset(lin_node->AT, w, node->dwork); + + node->left->eval_wsum_hess(node->left, node->dwork); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); +} + +expr *new_left_matmul(expr *u, const CSR_Matrix *A) +{ + /* Allocate the type-specific struct */ + left_matmul_expr *lin_node = + (left_matmul_expr *) calloc(1, sizeof(left_matmul_expr)); + expr *node = &lin_node->base; + init_expr(node, A->m, u->d2, u->n_vars, forward, jacobian_init, eval_jacobian, + is_affine, free_type_data); + node->left = u; + expr_retain(u); + node->wsum_hess_init = wsum_hess_init; + node->eval_wsum_hess = eval_wsum_hess; + + /* Initialize type-specific fields */ + lin_node->A = block_diag_repeat_csr(A, node->d2); + int alloc = MAX(lin_node->A->n, node->n_vars); + node->iwork = (int *) malloc(alloc * sizeof(int)); + lin_node->AT = transpose(lin_node->A, node->iwork); + lin_node->CSC_work = NULL; + + return node; +} diff --git a/src/bivariate/multiply.c b/src/bivariate/multiply.c index 63c9290..a6f9667 100644 --- a/src/bivariate/multiply.c +++ b/src/bivariate/multiply.c @@ -29,18 +29,8 @@ static void forward(expr *node, const double *u) static void jacobian_init(expr *node) { - /* if a child is a variable we initialize its jacobian for a - short chain rule implementation */ - if (node->left->var_id != NOT_A_VARIABLE) - { - node->left->jacobian_init(node->left); - } - - if 
(node->right->var_id != NOT_A_VARIABLE) - { - node->right->jacobian_init(node->right); - } - + node->left->jacobian_init(node->left); + node->right->jacobian_init(node->right); node->dwork = (double *) malloc(2 * node->size * sizeof(double)); int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max); @@ -180,6 +170,9 @@ static void eval_wsum_hess(expr *node, const double *w) /* Compute CT = C^T = A^T diag(w) B */ AT_fill_values(C, CT, node->iwork); + /* TODO: should fill sparsity before. Maybe we can map values from C to + * hessian directly instead?*/ + /* Hessian = C + CT = B^T diag(w) A + A^T diag(w) B */ sum_csr_matrices(C, CT, node->wsum_hess); } diff --git a/src/bivariate/quad_over_lin.c b/src/bivariate/quad_over_lin.c index adc4a2b..41b8dc5 100644 --- a/src/bivariate/quad_over_lin.c +++ b/src/bivariate/quad_over_lin.c @@ -68,7 +68,7 @@ static void jacobian_init(expr *node) /* compute required allocation and allocate jacobian */ bool *col_nz = (bool *) calloc( node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ - int nonzero_cols = count_nonzero_cols(lin_x->base.jacobian, col_nz); + int nonzero_cols = count_nonzero_cols(lin_x->A_csr, col_nz); node->jacobian = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); /* precompute column indices */ diff --git a/src/bivariate/right_matmul.c b/src/bivariate/right_matmul.c new file mode 100644 index 0000000..0c349fb --- /dev/null +++ b/src/bivariate/right_matmul.c @@ -0,0 +1,135 @@ +#include "bivariate.h" +#include "subexpr.h" +#include + +/* This file implements the atom 'right_matmul' corresponding to the operation y = + f(x) @ A, where A is a given matrix and f(x) is an arbitrary expression. + Here, f(x) can be a vector-valued expression and a matrix-valued + expression. The dimensions are f(x) - p x n, A - n x q, y - p x q. + Note that here A does not have global column indices but it is a local matrix. 
+ This is an important distinction compared to linear_op_expr. + + * To compute the forward pass: vec(y) = B @ vec(f(x)), + where B = A^T kron I_p is a Kronecker product of size (p*q) x (p*n). + + * To compute the Jacobian: J_y = B @ J_f(x), where J_f(x) is the + Jacobian of f(x) of size (p*n) x n_vars. + + * To compute the contribution to the Lagrange Hessian: we form + w_child = B^T @ w and then evaluate the hessian of f(x). +*/ + +static void forward(expr *node, const double *u) +{ + expr *x = node->left; + + /* child's forward pass */ + node->left->forward(node->left, u); + + /* y = x * A, vec(y) = B @ vec(x) */ + csr_matvec_wo_offset(((right_matmul_expr *) node)->B, x->value, node->value); +} + +static bool is_affine(const expr *node) +{ + return node->left->is_affine(node->left); +} + +static void free_type_data(expr *node) +{ + right_matmul_expr *right_node = (right_matmul_expr *) node; + free_csr_matrix(right_node->B); + free_csr_matrix(right_node->BT); + if (right_node->CSC_work) + { + free_csc_matrix(right_node->CSC_work); + } + right_node->B = NULL; + right_node->BT = NULL; + right_node->CSC_work = NULL; +} + +static void jacobian_init(expr *node) +{ + expr *x = node->left; + right_matmul_expr *right_node = (right_matmul_expr *) node; + + /* initialize child's jacobian and precompute sparsity of its transpose */ + x->jacobian_init(x); + right_node->CSC_work = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + + /* precompute sparsity of this node's jacobian */ + node->jacobian = csr_csc_matmul_alloc(right_node->B, right_node->CSC_work); +} + +static void eval_jacobian(expr *node) +{ + expr *x = node->left; + right_matmul_expr *right_node = (right_matmul_expr *) node; + + /* evaluate child's jacobian and convert to CSC*/ + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, right_node->CSC_work, node->iwork); + + /* compute this node's jacobian */ + csr_csc_matmul_fill_values(right_node->B, right_node->CSC_work, node->jacobian); +} + +static 
void wsum_hess_init(expr *node) +{ + /* initialize child's hessian */ + expr *x = node->left; + x->wsum_hess_init(x); + + /* allocate this node's hessian with the same sparsity as child's */ + node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz); + memcpy(node->wsum_hess->p, x->wsum_hess->p, (node->n_vars + 1) * sizeof(int)); + memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); + + /* Allocate workspace for B^T @ w */ + node->dwork = (double *) malloc(x->d1 * x->d2 * sizeof(double)); +} + +static void eval_wsum_hess(expr *node, const double *w) +{ + /* Compute B^T @ w, where B = A^T ⊗ I_p */ + right_matmul_expr *right_node = (right_matmul_expr *) node; + + /* B^T @ w computes the weights for the child expression */ + csr_matvec_wo_offset(right_node->BT, w, node->dwork); + + /* Propagate to child */ + node->left->eval_wsum_hess(node->left, node->dwork); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); +} + +expr *new_right_matmul(expr *u, const CSR_Matrix *A) +{ + /* Allocate the type-specific struct */ + right_matmul_expr *right_matmul_node = + (right_matmul_expr *) calloc(1, sizeof(right_matmul_expr)); + expr *node = &right_matmul_node->base; + + /* Output dimensions: u is p x n, A is n x q, output is p x q */ + int p = u->d1; + int q = A->n; + + init_expr(node, p, q, u->n_vars, forward, jacobian_init, eval_jacobian, + is_affine, free_type_data); + node->left = u; + expr_retain(u); + node->wsum_hess_init = wsum_hess_init; + node->eval_wsum_hess = eval_wsum_hess; + + /* create B = A^T kron I_p and its transpose */ + node->iwork = (int *) malloc(node->n_vars * sizeof(int)); + CSR_Matrix *AT = transpose(A, node->iwork); + right_matmul_node->B = kron_identity_csr(AT, p); + right_matmul_node->BT = kron_identity_csr(A, p); + free_csr_matrix(AT); + + right_matmul_node->CSC_work = NULL; + + return node; +} diff --git a/src/elementwise_univariate/common.c 
b/src/elementwise_univariate/common.c index c1c412e..2af054e 100644 --- a/src/elementwise_univariate/common.c +++ b/src/elementwise_univariate/common.c @@ -21,7 +21,7 @@ void jacobian_init_elementwise(expr *node) /* otherwise it will be a linear operator */ else { - CSR_Matrix *J = child->jacobian; + CSR_Matrix *J = ((linear_op_expr *) child)->A_csr; node->jacobian = new_csr_matrix(J->m, J->n, J->nnz); node->dwork = (double *) malloc(node->size * sizeof(double)); diff --git a/src/expr.c b/src/expr.c index fe6d1ac..3e8bad3 100644 --- a/src/expr.c +++ b/src/expr.c @@ -41,6 +41,12 @@ void free_expr(expr *node) node->left = NULL; node->right = NULL; + /* free type-specific data */ + if (node->free_type_data) + { + node->free_type_data(node); + } + /* free value array and jacobian */ free(node->value); free_csr_matrix(node->jacobian); @@ -53,12 +59,6 @@ void free_expr(expr *node) node->dwork = NULL; node->iwork = NULL; - /* free type-specific data */ - if (node->free_type_data) - { - node->free_type_data(node); - } - /* free the node itself */ free(node); } diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_Matrix.c index 841c2b3..d4389ab 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_Matrix.c @@ -122,6 +122,35 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz, return sum; } +/* Unweighted sparse dot product of two sorted index arrays */ +static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, + const double *b_x, const int *b_i, int b_nnz) +{ + int ii = 0; + int jj = 0; + double sum = 0.0; + + while (ii < a_nnz && jj < b_nnz) + { + if (a_i[ii] == b_i[jj]) + { + sum += a_x[ii] * b_x[jj]; + ii++; + jj++; + } + else if (a_i[ii] < b_i[jj]) + { + ii++; + } + else + { + jj++; + } + } + + return sum; +} + void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) { int i, j, ii, jj; @@ -196,6 +225,70 @@ CSC_Matrix *csr_to_csc(const CSR_Matrix *A) free(count); return C; } + +CSC_Matrix 
*csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork) +{ + CSC_Matrix *C = new_csc_matrix(A->m, A->n, A->nnz); + + int i, j, start; + int *count = iwork; + memset(count, 0, A->n * sizeof(int)); + + // ------------------------------------------------------------------- + // compute nnz in each column of A + // ------------------------------------------------------------------- + for (i = 0; i < A->m; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + count[A->i[j]]++; + } + } + + // ------------------------------------------------------------------ + // compute column pointers + // ------------------------------------------------------------------ + C->p[0] = 0; + for (i = 0; i < A->n; ++i) + { + C->p[i + 1] = C->p[i] + count[i]; + count[i] = C->p[i]; + } + + // ------------------------------------------------------------------ + // fill row indices + // ------------------------------------------------------------------ + for (i = 0; i < A->m; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + C->i[count[A->i[j]]] = i; + count[A->i[j]]++; + } + } + + return C; +} + +void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) +{ + int i, j; + int *count = iwork; + memcpy(count, C->p, A->n * sizeof(int)); + + // ------------------------------------------------------------------ + // fill values + // ------------------------------------------------------------------ + for (i = 0; i < A->m; ++i) + { + for (j = A->p[i]; j < A->p[i + 1]; ++j) + { + C->x[count[A->i[j]]] = A->x[j]; + count[A->i[j]]++; + } + } +} + CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) { /* A is m x n, B is m x p, C = B^T A is p x n */ @@ -304,3 +397,88 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, } } } + +static bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len) +{ + int ai = 0, bi = 0; + while (ai < a_len && bi < b_len) + { + if (a_idx[ai] == b_idx[bi]) return true; + if (a_idx[ai] < 
b_idx[bi]) + { + ai++; + } + else + { + bi++; + } + } + return false; +} + +/* Fill values of C = A @ B where A is CSR, B is CSC. */ +void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, + CSR_Matrix *C) +{ + for (int i = 0; i < A->m; i++) + { + for (int jj = C->p[i]; jj < C->p[i + 1]; jj++) + { + int j = C->i[jj]; + + int a_nnz = A->p[i + 1] - A->p[i]; + int b_nnz = B->p[j + 1] - B->p[j]; + + /* Compute dot product of row i of A and column j of B */ + double sum = sparse_dot(A->x + A->p[i], A->i + A->p[i], a_nnz, + B->x + B->p[j], B->i + B->p[j], b_nnz); + + C->x[jj] = sum; + } + } +} + +/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) + with precomputed sparsity pattern */ +CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) +{ + int m = A->m; + int p = B->n; + + int len_a, len_b; + + int *Cp = (int *) malloc((m + 1) * sizeof(int)); + iVec *Ci = iVec_new(m); + + Cp[0] = 0; + + // -------------------------------------------------------------- + // countz nnz and fill column indices + // -------------------------------------------------------------- + int nnz = 0; + for (int i = 0; i < A->m; i++) + { + len_a = A->p[i + 1] - A->p[i]; + + for (int j = 0; j < B->n; j++) + { + len_b = B->p[j + 1] - B->p[j]; + + if (has_overlap(A->i + A->p[i], len_a, B->i + B->p[j], len_b)) + { + iVec_append(Ci, j); + nnz++; + } + } + + Cp[i + 1] = nnz; + } + + CSR_Matrix *C = new_csr_matrix(m, p, nnz); + memcpy(C->p, Cp, (m + 1) * sizeof(int)); + memcpy(C->i, Ci->data, nnz * sizeof(int)); + free(Cp); + iVec_free(Ci); + + return C; +} diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_Matrix.c index 6b63c7e..2dbae1a 100644 --- a/src/utils/CSR_Matrix.c +++ b/src/utils/CSR_Matrix.c @@ -39,6 +39,76 @@ void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C) memcpy(C->x, A->x, A->nnz * sizeof(double)); } +CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) +{ + assert(p > 0); + + int m = A->m; + int n = 
A->n; + int nnz = A->nnz; + + CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + + int nnz_cursor = 0; + for (int block = 0; block < p; block++) + { + int row_offset = block * m; + int col_offset = block * n; + + for (int row = 0; row < m; row++) + { + int dest_row = row_offset + row; + A_kron->p[dest_row] = nnz_cursor; + + for (int j = A->p[row]; j < A->p[row + 1]; j++) + { + A_kron->i[nnz_cursor] = A->i[j] + col_offset; + A_kron->x[nnz_cursor] = A->x[j]; + nnz_cursor++; + } + + A_kron->p[dest_row + 1] = nnz_cursor; + } + } + + return A_kron; +} + +CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) +{ + assert(p > 0); + + int m = A->m; + int n = A->n; + int nnz = A->nnz; + + CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + + int nnz_cursor = 0; + for (int row_block = 0; row_block < m; row_block++) + { + for (int diag_idx = 0; diag_idx < p; diag_idx++) + { + int dest_row = row_block * p + diag_idx; + A_kron->p[dest_row] = nnz_cursor; + + /* Copy entries from row_block of A, adjusting column indices */ + for (int j = A->p[row_block]; j < A->p[row_block + 1]; j++) + { + int col_block = A->i[j]; + /* Column in result: col_block * p + diag_idx */ + A_kron->i[nnz_cursor] = col_block * p + diag_idx; + A_kron->x[nnz_cursor] = A->x[j]; + nnz_cursor++; + } + + A_kron->p[dest_row + 1] = nnz_cursor; + } + } + + return A_kron; +} + void csr_matvec(const CSR_Matrix *A, const double *x, double *y, int col_offset) { for (int row = 0; row < A->m; row++) @@ -52,6 +122,19 @@ void csr_matvec(const CSR_Matrix *A, const double *x, double *y, int col_offset) } } +void csr_matvec_wo_offset(const CSR_Matrix *A, const double *x, double *y) +{ + for (int row = 0; row < A->m; row++) + { + double sum = 0.0; + for (int j = A->p[row]; j < A->p[row + 1]; j++) + { + sum += A->x[j] * x[A->i[j]]; + } + y[row] = sum; + } +} + int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz) { for (int row = 0; row < A->m; row++) diff --git a/tests/all_tests.c 
b/tests/all_tests.c index f59d647..6486947 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -16,6 +16,7 @@ #include "jacobian_tests/test_composite.h" #include "jacobian_tests/test_elementwise_mult.h" #include "jacobian_tests/test_hstack.h" +#include "jacobian_tests/test_left_matmul.h" #include "jacobian_tests/test_log.h" #include "jacobian_tests/test_neg.h" #include "jacobian_tests/test_prod.h" @@ -23,6 +24,7 @@ #include "jacobian_tests/test_quad_form.h" #include "jacobian_tests/test_quad_over_lin.h" #include "jacobian_tests/test_rel_entr.h" +#include "jacobian_tests/test_right_matmul.h" #include "jacobian_tests/test_sum.h" #include "jacobian_tests/test_trace.h" #include "problem/test_problem.h" @@ -37,11 +39,13 @@ #include "wsum_hess/elementwise/test_trig.h" #include "wsum_hess/elementwise/test_xexp.h" #include "wsum_hess/test_hstack.h" +#include "wsum_hess/test_left_matmul.h" #include "wsum_hess/test_multiply.h" #include "wsum_hess/test_prod.h" #include "wsum_hess/test_quad_form.h" #include "wsum_hess/test_quad_over_lin.h" #include "wsum_hess/test_rel_entr.h" +#include "wsum_hess/test_right_matmul.h" #include "wsum_hess/test_sum.h" #include "wsum_hess/test_trace.h" @@ -105,6 +109,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_2, tests_run); mu_run_test(test_jacobian_trace_variable, tests_run); mu_run_test(test_jacobian_trace_composite, tests_run); + mu_run_test(test_jacobian_left_matmul_log, tests_run); + mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); + mu_run_test(test_jacobian_left_matmul_log_composite, tests_run); + mu_run_test(test_jacobian_right_matmul_log, tests_run); + mu_run_test(test_jacobian_right_matmul_log_vector, tests_run); printf("\n--- Weighted Sum of Hessian Tests ---\n"); mu_run_test(test_wsum_hess_log, tests_run); @@ -140,6 +149,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_sparse_random, tests_run); mu_run_test(test_wsum_hess_multiply_1, tests_run); mu_run_test(test_wsum_hess_multiply_2, tests_run); 
+ mu_run_test(test_wsum_hess_left_matmul, tests_run); + mu_run_test(test_wsum_hess_left_matmul_matrix, tests_run); + mu_run_test(test_wsum_hess_left_matmul_composite, tests_run); + mu_run_test(test_wsum_hess_right_matmul, tests_run); + mu_run_test(test_wsum_hess_right_matmul_vector, tests_run); // This test leads to seg fault // mu_run_test(test_wsum_hess_trace_variable, tests_run); @@ -153,6 +167,7 @@ int main(void) mu_run_test(test_csr_sum2, tests_run); mu_run_test(test_transpose, tests_run); mu_run_test(test_AT_alloc_and_fill, tests_run); + mu_run_test(test_kron_identity_csr, tests_run); mu_run_test(test_csr_to_csc1, tests_run); mu_run_test(test_csr_to_csc2, tests_run); mu_run_test(test_csr_vecmat_values_sparse, tests_run); diff --git a/tests/jacobian_tests/test_left_matmul.h b/tests/jacobian_tests/test_left_matmul.h new file mode 100644 index 0000000..09557ca --- /dev/null +++ b/tests/jacobian_tests/test_left_matmul.h @@ -0,0 +1,183 @@ +#include +#include + +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_jacobian_left_matmul_log() +{ + /* Test Jacobian of A @ log(x) where: + * x is 3x1 variable at x = [1, 2, 3] + * A is 4x3 sparse matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Output: A @ log(x) is 4x1 + * + * Jacobian is d(A @ log(x))/dx = A @ diag(1/x) + * At x = [1, 2, 3], this is: + * [1, 0, 2/3 ] + * [3, 0, 4/3 ] + * [5, 0, 2 ] + * [7, 0, 0 ] + * + * Stored in CSR format (4x3 sparse): + * nnz = 7 + * p = [0, 2, 4, 6, 7] + * i = [0, 2, 0, 2, 0, 2, 0] + * x = [1.0, 2.0/3.0, 3.0, 4.0/3.0, 5.0, 2.0, 7.0] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* Create sparse matrix A in CSR format */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * 
sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *log_x = new_log(x); + expr *A_log_x = new_left_matmul(log_x, A); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* Expected jacobian values: A @ diag(1/x) */ + double expected_Ax[7] = { + 1.0, /* row 0, col 0: 1 * (1/1) */ + 2.0 / 3.0, /* row 0, col 2: 2 * (1/3) */ + 3.0, /* row 1, col 0: 3 * (1/1) */ + 4.0 / 3.0, /* row 1, col 2: 4 * (1/3) */ + 5.0, /* row 2, col 0: 5 * (1/1) */ + 2.0, /* row 2, col 2: 6 * (1/3) */ + 7.0 /* row 3, col 0: 7 * (1/1) */ + }; + int expected_Ai[7] = {0, 2, 0, 2, 0, 2, 0}; + int expected_Ap[5] = {0, 2, 4, 6, 7}; + + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 7)); + mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 7)); + mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 5)); + + free_csr_matrix(A); + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_left_matmul_log_matrix() +{ + /* x is 3x2, vectorized column-wise: [1,2,3 | 4,5,6] */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + expr *x = new_variable(3, 2, 0, 6); + + /* Create sparse matrix A in CSR format (4x3) */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *log_x = new_log(x); + expr *A_log_x = new_left_matmul(log_x, A); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* Expected Jacobian: block-diagonal repeat of A scaled by diag(1./x) */ + double expected_Ax[14] = {/* first column block (x = [1, 2, 3]) */ + 1.0, 2.0 / 3.0, 3.0, 4.0 / 3.0, 5.0, 2.0, 7.0, + /* second column block (x = [4, 5, 6]) */ + 0.25, 1.0 / 3.0, 0.75, 2.0 / 3.0, 1.25, 1.0, 
1.75}; + int expected_Ai[14] = {0, 2, 0, 2, 0, 2, 0, 3, 5, 3, 5, 3, 5, 3}; + int expected_Ap[9] = {0, 2, 4, 6, 7, 9, 11, 13, 14}; + + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 14)); + mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 14)); + mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 9)); + + free_csr_matrix(A); + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_left_matmul_log_composite() +{ + /* Test Jacobian of A @ log(B @ x) where: + * x is 3x1 variable at x = [1, 2, 3] + * B is 3x3 dense matrix of all ones + * A is 4x3 sparse matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * + * B @ x = [1+2+3, 1+2+3, 1+2+3]^T = [6, 6, 6]^T + * log(B @ x) = [log(6), log(6), log(6)]^T + * A @ log(B @ x) is 4x1 + * + * Jacobian is d(A @ log(B @ x))/dx = A @ diag(1/(B@x)) @ B + * Where B@x = [6, 6, 6], so diag(1/(B@x)) = diag([1/6, 1/6, 1/6]) + * diag(1/(B@x)) @ B gives [1/6, 1/6, 1/6; 1/6, 1/6, 1/6; 1/6, 1/6, 1/6] + * A @ (diag(1/(B@x)) @ B) is 4x3 dense (all entries nonzero): + * Row 0: [1, 0, 2] @ [1/6, 1/6, 1/6; ...] = [1/6 + 2/6, ...] = [3/6, 3/6, 3/6] = + * [1/2, 1/2, 1/2] Row 1: [3, 0, 4] @ [...] = [3/6 + 4/6, ...] = [7/6, 7/6, 7/6] + * Row 2: [5, 0, 6] @ [...] = [5/6 + 6/6, ...] = [11/6, 11/6, 11/6] + * Row 3: [7, 0, 0] @ [...] 
= [7/6, 7/6, 7/6] + * + * Stored in CSR format (4x3 all dense): + * nnz = 12 + * p = [0, 3, 6, 9, 12] + * i = [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + * x = [1/2, 1/2, 1/2, 7/6, 7/6, 7/6, 11/6, 11/6, 11/6, 7/6, 7/6, 7/6] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* Create B matrix (3x3 all ones) */ + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + /* Create A matrix */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *Bx = new_linear(x, B); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_left_matmul(log_Bx, A); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->eval_jacobian(A_log_Bx); + + /* Expected jacobian values: A @ diag(1/(B@x)) @ B */ + double expected_Ax[12] = { + 0.5, 0.5, 0.5, /* row 0: [1/2, 1/2, 1/2] */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0, /* row 1: [7/6, 7/6, 7/6] */ + 11.0 / 6.0, 11.0 / 6.0, 11.0 / 6.0, /* row 2: [11/6, 11/6, 11/6] */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0 /* row 3: [7/6, 7/6, 7/6] */ + }; + int expected_Ai[12] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_Ap[5] = {0, 3, 6, 9, 12}; + + mu_assert("vals fail", cmp_double_array(A_log_Bx->jacobian->x, expected_Ax, 12)); + mu_assert("cols fail", cmp_int_array(A_log_Bx->jacobian->i, expected_Ai, 12)); + mu_assert("rows fail", cmp_int_array(A_log_Bx->jacobian->p, expected_Ap, 5)); + + free_csr_matrix(A); + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +} diff --git a/tests/jacobian_tests/test_right_matmul.h 
b/tests/jacobian_tests/test_right_matmul.h new file mode 100644 index 0000000..931df44 --- /dev/null +++ b/tests/jacobian_tests/test_right_matmul.h @@ -0,0 +1,102 @@ +#include <stdio.h> +#include <string.h> + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_jacobian_right_matmul_log() +{ + /* Test Jacobian of log(x) @ A where: + * x is 2x2 variable at x = [[1, 2], [3, 4]] (vectorized column-wise: [1, 3, 2, + * 4]) A is 2x3 sparse matrix [1, 0, 2; 3, 0, 4] Output: log(x) @ A is 2x3 + */ + double x_vals[4] = {1.0, 3.0, 2.0, 4.0}; // column-wise vectorization + expr *x = new_variable(2, 2, 0, 4); + + /* Create sparse matrix A in CSR format (2x3) */ + CSR_Matrix *A = new_csr_matrix(2, 3, 4); + int A_p[3] = {0, 2, 4}; + int A_i[4] = {0, 2, 0, 2}; + double A_x[4] = {1.0, 2.0, 3.0, 4.0}; + memcpy(A->p, A_p, 3 * sizeof(int)); + memcpy(A->i, A_i, 4 * sizeof(int)); + memcpy(A->x, A_x, 4 * sizeof(double)); + + expr *log_x = new_log(x); + expr *log_x_A = new_right_matmul(log_x, A); + + log_x_A->forward(log_x_A, x_vals); + log_x_A->jacobian_init(log_x_A); + log_x_A->eval_jacobian(log_x_A); + + /* Expected jacobian values */ + double expected_Ax[8] = { + 1.0, /* row 0, col 0 */ + 1.5, /* row 0, col 2 */ + 1.0 / 3.0, /* row 1, col 1 */ + 0.75, /* row 1, col 3 */ + 2.0, /* row 4, col 0 */ + 2.0, /* row 4, col 2 */ + 2.0 / 3.0, /* row 5, col 1 */ + 1.0 /* row 5, col 3 */ + }; + int expected_Ai[8] = {0, 2, 1, 3, 0, 2, 1, 3}; + int expected_Ap[7] = {0, 2, 4, 4, 4, 6, 8}; + + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 8)); + mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 8)); + mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->p, expected_Ap, 7)); + + free_csr_matrix(A); + free_expr(log_x_A); + return 0; +} + +const char *test_jacobian_right_matmul_log_vector() +{ + /* Test Jacobian of log(x) @ A where: + * x is 1x3 variable 
at x = [1, 2, 3] + * A is 3x2 sparse matrix [[1, 0], [2, 3], [0, 4]] + * Output: log(x) @ A is 1x2 + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(1, 3, 0, 3); + + /* Create sparse matrix A in CSR format (3x2) */ + CSR_Matrix *A = new_csr_matrix(3, 2, 4); + int A_p[4] = {0, 1, 3, 4}; + int A_i[4] = {0, 0, 1, 1}; + double A_x[4] = {1.0, 2.0, 3.0, 4.0}; + memcpy(A->p, A_p, 4 * sizeof(int)); + memcpy(A->i, A_i, 4 * sizeof(int)); + memcpy(A->x, A_x, 4 * sizeof(double)); + + expr *log_x = new_log(x); + expr *log_x_A = new_right_matmul(log_x, A); + + log_x_A->forward(log_x_A, x_vals); + log_x_A->jacobian_init(log_x_A); + log_x_A->eval_jacobian(log_x_A); + + /* Expected jacobian values: A^T @ diag(1/x) */ + double expected_Ax[4] = { + 1.0, /* row 0, col 0 */ + 1.0, /* row 0, col 1 */ + 1.5, /* row 1, col 1 */ + 4.0 / 3.0 /* row 1, col 2 */ + }; + int expected_Ai[4] = {0, 1, 1, 2}; + int expected_Ap[3] = {0, 2, 4}; + + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 4)); + mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 4)); + mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->p, expected_Ap, 3)); + + free_csr_matrix(A); + free_expr(log_x_A); + return 0; +} diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h index b2cfced..7a42970 100644 --- a/tests/utils/test_csr_matrix.h +++ b/tests/utils/test_csr_matrix.h @@ -435,3 +435,50 @@ const char *test_AT_alloc_and_fill() return 0; } + +const char *test_kron_identity_csr() +{ + /* Test A kron I_p where: + * A is 2x3: + * [1 0 2] + * [3 0 4] + * + * p = 2, so result should be 4x6 (2*2 rows, 3*2 cols) + * A kron I_2 = + * [1 0 | 0 0 | 2 0] + * [0 1 | 0 0 | 0 2] + * [-----|-----|-----] + * [3 0 | 0 0 | 4 0] + * [0 3 | 0 0 | 0 4] + */ + CSR_Matrix *A = new_csr_matrix(2, 3, 4); + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + int Ai[4] = {0, 2, 0, 2}; + int Ap[3] = {0, 2, 4}; + memcpy(A->x, Ax, 4 * sizeof(double)); + memcpy(A->i, Ai, 4 * 
sizeof(int)); + memcpy(A->p, Ap, 3 * sizeof(int)); + + CSR_Matrix *result = kron_identity_csr(A, 2); + + /* Expected: 4x6 with 8 nonzeros + * Row 0: [1, 0, 0, 0, 2, 0] -> cols {0, 4}, vals {1, 2} + * Row 1: [0, 1, 0, 0, 0, 2] -> cols {1, 5}, vals {1, 2} + * Row 2: [3, 0, 0, 0, 4, 0] -> cols {0, 4}, vals {3, 4} + * Row 3: [0, 3, 0, 0, 0, 4] -> cols {1, 5}, vals {3, 4} + */ + double expected_x[8] = {1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 4.0}; + int expected_i[8] = {0, 4, 1, 5, 0, 4, 1, 5}; + int expected_p[5] = {0, 2, 4, 6, 8}; + + mu_assert("dimensions incorrect", result->m == 4 && result->n == 6); + mu_assert("nnz incorrect", result->nnz == 8); + mu_assert("vals incorrect", cmp_double_array(result->x, expected_x, 8)); + mu_assert("cols incorrect", cmp_int_array(result->i, expected_i, 8)); + mu_assert("rows incorrect", cmp_int_array(result->p, expected_p, 5)); + + free_csr_matrix(A); + free_csr_matrix(result); + + return 0; +} diff --git a/tests/wsum_hess/test_left_matmul.h b/tests/wsum_hess/test_left_matmul.h new file mode 100644 index 0000000..b1f0923 --- /dev/null +++ b/tests/wsum_hess/test_left_matmul.h @@ -0,0 +1,263 @@ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_wsum_hess_left_matmul() +{ + /* Test weighted sum of Hessian of A @ log(x) where: + * x is 3x1 variable at x = [1, 2, 3] + * A is 4x3 sparse matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Output: A @ log(x) is 4x1 + * Weights w = [1, 2, 3, 4] + * + * For log(x_i), the Hessian is diagonal with h_ii = -1/x_i^2 + * The output f = A @ log(x) has: + * f[0] = log(x[0]) + 2*log(x[2]) + * f[1] = 3*log(x[0]) + 4*log(x[2]) + * f[2] = 5*log(x[0]) + 6*log(x[2]) + * f[3] = 7*log(x[0]) + * + * Weighted sum of Hessian: sum_k w[k] * d²f[k]/dx² + * For each variable x[i], we need to find which outputs f[k] depend on it. 
+ * + * x[0] affects f[0], f[1], f[2], f[3] with coefficients 1, 3, 5, 7 + * x[1] doesn't affect any output (column 1 of A is all zeros) + * x[2] affects f[0], f[1], f[2] with coefficients 2, 4, 6 + * + * d²f[k]/dx[i]² = A[k,i] * (-1/x[i]²) + * + * wsum_hess[i,i] = sum_k w[k] * A[k,i] * (-1/x[i]²) + * = (-1/x[i]²) * sum_k w[k] * A[k,i] + * = (-1/x[i]²) * (A^T @ w)[i] + * + * A^T @ w: + * [1, 3, 5, 7] @ [1, 2, 3, 4] = 1*1 + 3*2 + 5*3 + 7*4 = 1 + 6 + 15 + 28 = 50 + * [0, 0, 0, 0] @ [1, 2, 3, 4] = 0 + * [2, 4, 6, 0] @ [1, 2, 3, 4] = 2*1 + 4*2 + 6*3 = 2 + 8 + 18 = 28 + * + * wsum_hess[0,0] = -50 / 1² = -50 + * wsum_hess[1,1] = 0 + * wsum_hess[2,2] = -28 / 3² = -28/9 + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + + /* Create sparse matrix A in CSR format */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *log_x = new_log(x); + expr *A_log_x = new_left_matmul(log_x, A); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + /* Expected wsum_hess: diagonal matrix with all 3 entries + * (sparsity matches child's diagonal Hessian) */ + double expected_x[3] = { + -50.0, /* position [0,0]: -50 / 1² */ + -0.0, /* position [1,1]: 0 * (-1/4) = 0 */ + -28.0 / 9.0 /* position [2,2]: -28 / 9 */ + }; + int expected_i[3] = {0, 1, 2}; + int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ + + mu_assert("vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); + mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 3)); + mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 4)); + + 
free_csr_matrix(A); + free_expr(A_log_x); + return 0; +} + +const char *test_wsum_hess_left_matmul_composite() +{ + /* Test weighted sum of Hessian of A @ log(B @ x) where: + * x is 3x1 variable at x = [1, 2, 3] + * B is 3x3 dense matrix of all ones + * A is 4x3 sparse matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Weights w = [1, 2, 3, 4] + * + * B @ x = [6, 6, 6]^T + * log(B @ x) = [log(6), log(6), log(6)]^T + * f = A @ log(B @ x) is 4x1 + * + * The Hessian is more complex due to the composite chain rule. + * df/dx = A @ diag(1/(B@x)) @ B + * + * The second derivative involves: + * d²f/dx² = A @ d(diag(1/y))/dy @ dy/dx @ B + A @ diag(1/y) @ 0 + * where y = B @ x + * + * d(diag(1/y))/dy = -diag(1/y²) + * dy/dx = B + * + * So: d²f[k]/dx² = -sum_j sum_i A[k,i] * (1/(B@x)_i²) * B[i,j] * B[i,l] + * This is dense and symmetric. + * + * For our case with B = all ones, B@x = [6,6,6], so 1/(B@x)² = 1/36 for each + * element + * + * For wsum_hess = sum_k w_k * d²f_k/dx² + * We use the chain rule: weight w goes through A + * weighted_w_prime[i] = sum_k w_k * A[k,i] + * Then: wsum_hess[j,l] = -sum_i weighted_w_prime[i] * (1/36) * B[i,j] * B[i,l] + * + * A^T @ w = [50, 0, 28]^T (from A^T w) + * So weighted contribution for each element of B@x is -[50, 0, 28] / 36 + * + * The result is wsum_hess = B^T @ diag(-[50, 0, 28] / 36) @ B. + * Since B is all ones, every entry of B^T @ diag(c) @ B equals sum_i c_i, + * so each of the 9 entries is + * -(50 + 0 + 28) / 36 = -78/36 = -13/6 + * + * Stored as dense 3x3: + * nnz = 9 (fully dense) + * p = [0, 3, 6, 9] + * i = [0, 1, 2, 0, 1, 2, 0, 1, 2] + * x = [-13/6 repeated nine times] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + + /* Create B matrix (3x3 all ones) */ + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 
2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + /* Create A matrix */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *Bx = new_linear(x, B); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_left_matmul(log_Bx, A); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->wsum_hess_init(A_log_Bx); + A_log_Bx->eval_wsum_hess(A_log_Bx, w); + + /* Expected wsum_hess: 3x3 dense matrix (9 nonzeros) + * All entries are -13/6 because the hessian of log(B@x) w.r.t. x + * becomes dense when B is dense (all ones) */ + double expected_x[9] = { + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, /* row 0 */ + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, /* row 1 */ + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0 /* row 2 */ + }; + int expected_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_p[4] = {0, 3, 6, 9}; /* each row has 3 entries */ + + mu_assert("vals incorrect", + cmp_double_array(A_log_Bx->wsum_hess->x, expected_x, 9)); + mu_assert("cols incorrect", + cmp_int_array(A_log_Bx->wsum_hess->i, expected_i, 9)); + mu_assert("rows incorrect", + cmp_int_array(A_log_Bx->wsum_hess->p, expected_p, 4)); + + free_csr_matrix(A); + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +} + +const char *test_wsum_hess_left_matmul_matrix() +{ + /* Test weighted sum of Hessian of A @ log(x) where: + * x is 3x2 variable, vectorized column-wise: [1, 2, 3, 4, 5, 6] + * A is 4x3 sparse matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Output: A @ log(x) is 4x2, vectorized to 8x1 + * Weights w = [1, 2, 3, 4, 5, 6, 7, 8] + * + * The operation is block-diagonal: + * - Column 0 of x: [1, 2, 3] with 
weights w[0:4] = [1, 2, 3, 4] + * - Column 1 of x: [4, 5, 6] with weights w[4:8] = [5, 6, 7, 8] + * + * For column 0 (variables x[0:3]): + * A^T @ w[0:4] = [1*1 + 3*2 + 5*3 + 7*4, 0, 2*1 + 4*2 + 6*3] + * = [50, 0, 28] + * wsum_hess[0,0] = -50 / 1² = -50 + * wsum_hess[1,1] = 0 + * wsum_hess[2,2] = -28 / 3² = -28/9 + * + * For column 1 (variables x[3:6]): + * A^T @ w[4:8] = [1*5 + 3*6 + 5*7 + 7*8, 0, 2*5 + 4*6 + 6*7] + * = [114, 0, 76] + * wsum_hess[3,3] = -114 / 4² = -114/16 = -57/8 + * wsum_hess[4,4] = 0 + * wsum_hess[5,5] = -76 / 6² = -76/36 = -19/9 + */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double w[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + + expr *x = new_variable(3, 2, 0, 6); + + /* Create sparse matrix A in CSR format */ + CSR_Matrix *A = new_csr_matrix(4, 3, 7); + int A_p[5] = {0, 2, 4, 6, 7}; + int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; + double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->p, A_p, 5 * sizeof(int)); + memcpy(A->i, A_i, 7 * sizeof(int)); + memcpy(A->x, A_x, 7 * sizeof(double)); + + expr *log_x = new_log(x); + expr *A_log_x = new_left_matmul(log_x, A); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + /* Expected wsum_hess: 6x6 diagonal matrix with all 6 entries */ + double expected_x[6] = { + -50.0, /* position [0,0]: column 0, variable 0 */ + -0.0, /* position [1,1]: column 0, variable 1 */ + -28.0 / 9.0, /* position [2,2]: column 0, variable 2 */ + -57.0 / 8.0, /* position [3,3]: column 1, variable 0 */ + -0.0, /* position [4,4]: column 1, variable 1 */ + -19.0 / 9.0 /* position [5,5]: column 1, variable 2 */ + }; + int expected_i[6] = {0, 1, 2, 3, 4, 5}; + int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; /* each row has 1 diagonal entry */ + + mu_assert("vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); + mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 6)); + 
mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 7)); + + free_csr_matrix(A); + free_expr(A_log_x); + return 0; +} diff --git a/tests/wsum_hess/test_right_matmul.h b/tests/wsum_hess/test_right_matmul.h new file mode 100644 index 0000000..ca109f8 --- /dev/null +++ b/tests/wsum_hess/test_right_matmul.h @@ -0,0 +1,110 @@ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_wsum_hess_right_matmul() +{ + /* Test weighted sum of Hessian of log(x) @ A where: + * x is 2x2 variable at x = [[1, 2], [3, 4]] (vectorized column-wise: [1, 3, 2, + * 4]) A is 2x3 sparse matrix [[1, 0, 2], [3, 0, 4]] Output: log(x) @ A is 2x3 + * Weights w is 2x3 (vectorized column-wise: w = [w00, w10, w01, w11, w02, w12]) + * = [1, 2, 3, 4, 5, 6] + */ + double x_vals[4] = {1.0, 3.0, 2.0, 4.0}; // column-wise vectorization + double w[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; // weights for 2x3 output + + expr *x = new_variable(2, 2, 0, 4); + + /* Create sparse matrix A in CSR format (2x3) */ + CSR_Matrix *A = new_csr_matrix(2, 3, 4); + int A_p[3] = {0, 2, 4}; + int A_i[4] = {0, 2, 0, 2}; + double A_x[4] = {1.0, 2.0, 3.0, 4.0}; + memcpy(A->p, A_p, 3 * sizeof(int)); + memcpy(A->i, A_i, 4 * sizeof(int)); + memcpy(A->x, A_x, 4 * sizeof(double)); + + expr *log_x = new_log(x); + expr *log_x_A = new_right_matmul(log_x, A); + + log_x_A->forward(log_x_A, x_vals); + log_x_A->jacobian_init(log_x_A); + log_x_A->wsum_hess_init(log_x_A); + log_x_A->eval_wsum_hess(log_x_A, w); + + /* Expected wsum_hess: diagonal matrix with 4 entries */ + double expected_x[4] = { + -11.0, /* position [0,0]: -11 */ + -14.0 / 9.0, /* position [1,1]: -14/9 */ + -23.0 / 4.0, /* position [2,2]: -23/4 */ + -15.0 / 8.0 /* position [3,3]: -15/8 */ + }; + int expected_i[4] = {0, 1, 2, 3}; + int expected_p[5] = {0, 1, 2, 3, 4}; /* each row has 1 diagonal entry 
*/ + + mu_assert("vals incorrect", + cmp_double_array(log_x_A->wsum_hess->x, expected_x, 4)); + mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 4)); + mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 5)); + + free_csr_matrix(A); + free_expr(log_x_A); + return 0; +} + +const char *test_wsum_hess_right_matmul_vector() +{ + /* Test weighted sum of Hessian of log(x) @ A where: + * x is 1x3 variable at x = [1, 2, 3] + * A is 3x2 sparse matrix [[1, 0], [2, 3], [0, 4]] + * Output: log(x) @ A is 1x2 + * Weights w is 1x2 (vectorized: w = [w0, w1] = [1, 2]) + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[2] = {1.0, 2.0}; // weights for 1x2 output + + expr *x = new_variable(1, 3, 0, 3); + + /* Create sparse matrix A in CSR format (3x2) */ + CSR_Matrix *A = new_csr_matrix(3, 2, 4); + int A_p[4] = {0, 1, 3, 4}; + int A_i[4] = {0, 0, 1, 1}; + double A_x[4] = {1.0, 2.0, 3.0, 4.0}; + memcpy(A->p, A_p, 4 * sizeof(int)); + memcpy(A->i, A_i, 4 * sizeof(int)); + memcpy(A->x, A_x, 4 * sizeof(double)); + + expr *log_x = new_log(x); + expr *log_x_A = new_right_matmul(log_x, A); + + log_x_A->forward(log_x_A, x_vals); + log_x_A->jacobian_init(log_x_A); + log_x_A->wsum_hess_init(log_x_A); + log_x_A->eval_wsum_hess(log_x_A, w); + + /* Expected wsum_hess: diagonal matrix with 3 entries */ + double expected_x[3] = { + -1.0, /* position [0,0]: -1 */ + -2.0, /* position [1,1]: -2 */ + -8.0 / 9.0 /* position [2,2]: -8/9 */ + }; + int expected_i[3] = {0, 1, 2}; + int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ + + mu_assert("vals incorrect", + cmp_double_array(log_x_A->wsum_hess->x, expected_x, 3)); + mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 3)); + mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 4)); + + free_csr_matrix(A); + free_expr(log_x_A); + return 0; +}