diff --git a/README.md b/README.md
index d1d7b1f..b32b769 100644
--- a/README.md
+++ b/README.md
@@ -2,4 +2,13 @@
 4. in the refactor, add consts
 5. multiply with one constant vector/scalar argument
 6. AX where X is a matrix. Can that happen? How is that canonicalized? Maybe it can't happen.
-7. Must be able to compute jacobian and hessian of A @ phi(x), so linear operator needs other code! This requires new infrastructure, I think.
\ No newline at end of file
+7. Must be able to compute jacobian and hessian of A @ phi(x), so linear operator needs other code! This requires new infrastructure, I think.
+8. Shortcut hessians of affine stuff?
+
+Going through all atoms to see that sparsity pattern is computed in initialization of jacobian:
+2. trace - not ok
+
+Going through all atoms to see that sparsity pattern is computed in initialization of hessian:
+2. hstack - not ok
+3. trace - not ok
+
diff --git a/include/affine.h b/include/affine.h
index e1b66ac..12b1dab 100644
--- a/include/affine.h
+++ b/include/affine.h
@@ -11,6 +11,7 @@ expr *new_add(expr *left, expr *right);
 
 expr *new_sum(expr *child, int axis);
 expr *new_hstack(expr **args, int n_args, int n_vars);
+expr *new_trace(expr *child);
 
 expr *new_constant(int d1, int d2, int n_vars, const double *values);
 expr *new_variable(int d1, int d2, int var_id, int n_vars);
diff --git a/include/expr.h b/include/expr.h
index f6381a3..56173b9 100644
--- a/include/expr.h
+++ b/include/expr.h
@@ -5,6 +5,7 @@
 #include "utils/CSR_Matrix.h"
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #define JAC_IDXS_NOT_SET -1
 #define NOT_A_VARIABLE -1
@@ -53,6 +54,9 @@ typedef struct expr
     local_wsum_hess_fn local_wsum_hess; /* used by elementwise univariate atoms*/
     free_type_data_fn free_type_data;   /* Cleanup for type-specific fields */
 
+    // name of node just for debugging - should be removed later
+    char name[32];
+
 } expr;
 
 void init_expr(expr *node, int d1, int d2, int n_vars, forward_fn forward,
diff --git a/include/subexpr.h b/include/subexpr.h
index 4b29416..1b6d6a2 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -37,9 +37,16 @@ typedef struct sum_expr
 {
     expr base;
     int axis;
-    struct int_double_pair *int_double_pairs; /* for sorting jacobian entries */
+    int *idx_map; /* maps child nnz to summed-row positions */
 } sum_expr;
 
+/* Trace-like reduction: sums entries spaced by child->d1 */
+typedef struct trace_expr
+{
+    expr base;
+    struct int_double_pair *int_double_pairs; /* for sorting jacobian entries */
+} trace_expr;
+
 /* Product of all entries */
 typedef struct prod_expr
 {
diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_Matrix.h
index ca4b0a2..aa4d7ab 100644
--- a/include/utils/CSR_Matrix.h
+++ b/include/utils/CSR_Matrix.h
@@ -52,29 +52,86 @@ void csr_insert_value(CSR_Matrix *A, int col_idx, double value);
  * d must have length m
  * C must be pre-allocated with same dimensions as A */
 void diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C);
+void diag_csr_mult_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C);
 
 /* Compute C = A + B where A, B, C are CSR matrices
  * A and B must have same dimensions
  * C must be pre-allocated with sufficient nnz capacity.
  * C must be different from A and B */
 void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C);
+/* Compute sparsity pattern of A + B where A, B, C are CSR matrices.
+ * Fills C->p, C->i, and C->nnz; does not touch C->x. */
+void sum_csr_matrices_fill_sparsity(const CSR_Matrix *A, const CSR_Matrix *B,
+                                    CSR_Matrix *C);
+
+/* Fill only the values of C = A + B, assuming C's sparsity pattern (p and i)
+ * is already filled and matches the union of A and B per row. Does not modify
+ * C->p, C->i, or C->nnz. */
+void sum_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
+                                  CSR_Matrix *C);
 
 /* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR matrices */
 void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C,
                              const double *d1, const double *d2);
 
+/* Fill only the values of C = diag(d1) * A + diag(d2) * B, assuming C's sparsity
+ * pattern (p and i) is already filled and matches the union of A and B per row.
+ * Does not modify C->p, C->i, or C->nnz. */
+void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
+                                         CSR_Matrix *C, const double *d1,
+                                         const double *d2);
+
 /* Sum all rows of A into a single row matrix C */
 void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
                       struct int_double_pair *pairs);
 
+/* iwork must have size max(C->n, A->nnz), and idx_map must have size A->nnz. */
+void sum_all_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A, CSR_Matrix *C,
+                                                int *iwork, int *idx_map);
+
+/* Fill values of summed rows using precomputed idx_map and sparsity of C */
+// void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+//                                  const int *idx_map);
+
+/* Fill accumulator for summing rows using precomputed idx_map for each nnz of A*/
+void idx_map_accumulator(const CSR_Matrix *A, const int *idx_map,
+                         double *accumulator);
+
 /* Sum blocks of rows of A into a matrix C */
 void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
                            struct int_double_pair *pairs, int row_block_size);
 
+/* Build sparsity and index map for summing blocks of rows.
+ * iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
+void sum_block_of_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
+                                                     CSR_Matrix *C,
+                                                     int row_block_size, int *iwork,
+                                                     int *idx_map);
+
+/* Fill values for summing blocks of rows using precomputed idx_map */
+// void sum_block_of_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+//                                        const int *idx_map);
+
 /* Sum evenly spaced rows of A into a matrix C */
 void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
                                 struct int_double_pair *pairs, int row_spacing);
 
+/* Build sparsity and index map for summing evenly spaced rows.
+ * iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
+void sum_evenly_spaced_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
+                                                          CSR_Matrix *C,
+                                                          int row_spacing,
+                                                          int *iwork, int *idx_map);
+
+/* Fill values for summing evenly spaced rows using precomputed idx_map */
+// void sum_evenly_spaced_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+//                                             const int *idx_map);
+
+/* Sum evenly spaced rows of A starting at offset into a row matrix C */
+void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C,
+                                  struct int_double_pair *pairs, int offset,
+                                  int spacing);
+
 /* Count number of columns with nonzero entries */
 int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz);
 
diff --git a/src/affine/add.c b/src/affine/add.c
index 9f6b2d1..d7994b3 100644
--- a/src/affine/add.c
+++ b/src/affine/add.c
@@ -22,6 +22,10 @@ static void jacobian_init(expr *node)
     /* we never have to store more than the sum of children's nnz */
     int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz;
     node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max);
+
+    /* fill sparsity pattern  */
+    sum_csr_matrices_fill_sparsity(node->left->jacobian, node->right->jacobian,
+                                   node->jacobian);
 }
 
 static void eval_jacobian(expr *node)
@@ -31,7 +35,8 @@ static void eval_jacobian(expr *node)
     node->right->eval_jacobian(node->right);
 
     /* sum children's jacobians */
-    sum_csr_matrices(node->left->jacobian, node->right->jacobian, node->jacobian);
+    sum_csr_matrices_fill_values(node->left->jacobian, node->right->jacobian,
+                                 node->jacobian);
 }
 
 static void wsum_hess_init(expr *node)
@@ -43,6 +48,10 @@ static void wsum_hess_init(expr *node)
     /* we never have to store more than the sum of children's nnz */
     int nnz_max = node->left->wsum_hess->nnz + node->right->wsum_hess->nnz;
     node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz_max);
+
+    /* TODO: we should fill sparsity pattern here for consistency */
+    sum_csr_matrices_fill_sparsity(node->left->wsum_hess, node->right->wsum_hess,
+                                   node->wsum_hess);
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -52,7 +61,8 @@ static void eval_wsum_hess(expr *node, const double *w)
     node->right->eval_wsum_hess(node->right, w);
 
     /* sum children's wsum_hess */
-    sum_csr_matrices(node->left->wsum_hess, node->right->wsum_hess, node->wsum_hess);
+    sum_csr_matrices_fill_values(node->left->wsum_hess, node->right->wsum_hess,
+                                 node->wsum_hess);
 }
 
 static bool is_affine(const expr *node)
@@ -74,5 +84,8 @@ expr *new_add(expr *left, expr *right)
     node->wsum_hess_init = wsum_hess_init;
     node->eval_wsum_hess = eval_wsum_hess;
 
+    // just for debugging, should be removed
+    strcpy(node->name, "add");
+
     return node;
 }
diff --git a/src/affine/sum.c b/src/affine/sum.c
index ee9988b..7114124 100644
--- a/src/affine/sum.c
+++ b/src/affine/sum.c
@@ -5,6 +5,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 static void forward(expr *node, const double *u)
 {
     int i, j, end;
@@ -62,42 +64,46 @@ static void jacobian_init(expr *node)
 {
     expr *x = node->left;
     sum_expr *snode = (sum_expr *) node;
+    int axis = snode->axis;
 
     /* initialize child's jacobian */
     x->jacobian_init(x);
 
     /* we never have to store more than the child's nnz */
     node->jacobian = new_csr_matrix(node->d1, node->n_vars, x->jacobian->nnz);
-    snode->int_double_pairs = new_int_double_pair_array(x->jacobian->nnz);
-}
-
-static void eval_jacobian(expr *node)
-{
-    expr *x = node->left;
-    sum_expr *snode = (sum_expr *) node;
-    int axis = snode->axis;
-
-    /* evaluate child's jacobian */
-    x->eval_jacobian(x);
+    node->iwork = malloc(MAX(node->jacobian->n, x->jacobian->nnz) * sizeof(int));
+    snode->idx_map = malloc(x->jacobian->nnz * sizeof(int));
 
-    /* sum rows or columns of child's jacobian */
     if (axis == -1)
     {
-        sum_all_rows_csr(x->jacobian, node->jacobian, snode->int_double_pairs);
+        sum_all_rows_csr_fill_sparsity_and_idx_map(x->jacobian, node->jacobian,
+                                                   node->iwork, snode->idx_map);
     }
     else if (axis == 0)
     {
-        sum_block_of_rows_csr(x->jacobian, node->jacobian, snode->int_double_pairs,
-                              x->d1);
+        sum_block_of_rows_csr_fill_sparsity_and_idx_map(
+            x->jacobian, node->jacobian, x->d1, node->iwork, snode->idx_map);
     }
-
     else if (axis == 1)
     {
-        sum_evenly_spaced_rows_csr(x->jacobian, node->jacobian,
-                                   snode->int_double_pairs, node->d1);
+        sum_evenly_spaced_rows_csr_fill_sparsity_and_idx_map(
+            x->jacobian, node->jacobian, node->d1, node->iwork, snode->idx_map);
     }
 }
 
+static void eval_jacobian(expr *node)
+{
+    expr *x = node->left;
+
+    /* evaluate child's jacobian */
+    x->eval_jacobian(x);
+
+    /* we have precomputed an idx map between the nonzeros of the child's jacobian
+       and this node's jacobian, so we just accumulate accordingly */
+    idx_map_accumulator(x->jacobian, ((sum_expr *) node)->idx_map,
+                        node->jacobian->x);
+}
+
 static void wsum_hess_init(expr *node)
 {
     expr *x = node->left;
@@ -107,6 +113,10 @@ static void wsum_hess_init(expr *node)
     /* we never have to store more than the child's nnz */
     node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz);
     node->dwork = malloc(x->size * sizeof(double));
+
+    /* copy sparsity pattern */
+    memcpy(node->wsum_hess->p, x->wsum_hess->p, (x->n_vars + 1) * sizeof(int));
+    memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -130,8 +140,8 @@ static void eval_wsum_hess(expr *node, const double *w)
 
     x->eval_wsum_hess(x, node->dwork);
 
-    /* todo: is this copy necessary or can we just change pointers? */
-    copy_csr_matrix(x->wsum_hess, node->wsum_hess);
+    /* copy values (TODO: is this necessary or can we just change pointers?)*/
+    memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double));
 }
 
 static bool is_affine(const expr *node)
@@ -142,7 +152,7 @@ static bool is_affine(const expr *node)
 static void free_type_data(expr *node)
 {
     sum_expr *snode = (sum_expr *) node;
-    free_int_double_pair_array(snode->int_double_pairs);
+    free(snode->idx_map);
 }
 
 expr *new_sum(expr *child, int axis)
@@ -179,7 +189,6 @@ expr *new_sum(expr *child, int axis)
 
     /* Set type-specific fields */
     snode->axis = axis;
-    snode->int_double_pairs = NULL;
 
     return node;
 }
diff --git a/src/affine/trace.c b/src/affine/trace.c
new file mode 100644
index 0000000..a424277
--- /dev/null
+++ b/src/affine/trace.c
@@ -0,0 +1,137 @@
+#include "affine.h"
+#include "utils/int_double_pair.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void forward(expr *node, const double *u)
+{
+    expr *x = node->left;
+
+    /* child's forward pass */
+    x->forward(x, u);
+
+    /* local forward pass */
+    double sum = 0.0;
+    int row_spacing = x->d1 + 1;
+    for (int idx = 0; idx < x->size; idx += row_spacing)
+    {
+        sum += x->value[idx];
+    }
+
+    node->value[0] = sum;
+}
+
+static void jacobian_init(expr *node)
+{
+    expr *x = node->left;
+
+    /* initialize child's jacobian */
+    x->jacobian_init(x);
+
+    /* count total nnz in the rows of the jacobian that should be summed */
+    const CSR_Matrix *A = x->jacobian;
+    int total_nnz = 0;
+    int row_spacing = x->d1 + 1;
+    for (int row = 0; row < A->m; row += row_spacing)
+    {
+        total_nnz += (A->p[row + 1] - A->p[row]);
+    }
+
+    node->jacobian = new_csr_matrix(1, node->n_vars, total_nnz);
+    ((trace_expr *) node)->int_double_pairs = new_int_double_pair_array(total_nnz);
+}
+
+static void eval_jacobian(expr *node)
+{
+    expr *x = node->left;
+    trace_expr *tnode = (trace_expr *) node;
+
+    /* evaluate child's jacobian */
+    x->eval_jacobian(x);
+
+    /* local jacobian */
+    sum_spaced_rows_into_row_csr(x->jacobian, node->jacobian,
+                                 tnode->int_double_pairs, 0, x->d1 + 1);
+}
+
+/* Placeholders for Hessian-related functions */
+static void wsum_hess_init(expr *node)
+{
+    expr *x = node->left;
+
+    /* initialize child's hessian */
+    x->wsum_hess_init(x);
+
+    node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz);
+    node->dwork = (double *) calloc(x->size, sizeof(double));
+
+    /* TODO: here we could copy over sparsity pattern once we have checked
+    that all atoms fill their sparsity pattern in the init functions. Perhaps
+    we should only take sparsity pattern of rows that are summed? Not the rows
+    which will get zero weight in the hessian. That would be very cool.
+    But must eval_wsum_hess then also ignore contributions with zero weight? that
+    would be bad. */
+
+    /* lets conclude that the hessian can be made more sophisticated */
+
+    /* but perhaps let's keep it as simple as possible! */
+}
+
+static void eval_wsum_hess(expr *node, const double *w)
+{
+    expr *x = node->left;
+
+    int row_spacing = x->d1 + 1;
+    for (int i = 0; i < x->size; i += row_spacing)
+    {
+        node->dwork[i] = w[0];
+    }
+
+    x->eval_wsum_hess(x, node->dwork);
+
+    /* TODO: here we only need to copy over values once we have filled the sparsity
+     * pattern in wsum_hess_init*/
+    node->wsum_hess->nnz = x->wsum_hess->nnz;
+    memcpy(node->wsum_hess->p, x->wsum_hess->p, sizeof(int) * (node->n_vars + 1));
+    memcpy(node->wsum_hess->i, x->wsum_hess->i, sizeof(int) * x->wsum_hess->nnz);
+    memcpy(node->wsum_hess->x, x->wsum_hess->x, sizeof(double) * x->wsum_hess->nnz);
+
+    /* This might contain many many zeros actually! Hmm...*/
+}
+
+static bool is_affine(const expr *node)
+{
+    return node->left->is_affine(node->left);
+}
+
+static void free_type_data(expr *node)
+{
+    trace_expr *tnode = (trace_expr *) node;
+    free_int_double_pair_array(tnode->int_double_pairs);
+}
+
+expr *new_trace(expr *child)
+{
+    /* Output is a single scalar */
+    int d1 = 1;
+
+    trace_expr *tnode = (trace_expr *) calloc(1, sizeof(trace_expr));
+    expr *node = &tnode->base;
+    init_expr(node, d1, 1, child->n_vars, forward, jacobian_init, eval_jacobian,
+              is_affine, free_type_data);
+    node->left = child;
+    expr_retain(child);
+
+    /* Hessian placeholders */
+    node->wsum_hess_init = wsum_hess_init;
+    node->eval_wsum_hess = eval_wsum_hess;
+
+    /* Initialize type-specific fields */
+    tnode->int_double_pairs = NULL;
+
+    // just for debugging, should be removed
+    strcpy(node->name, "trace");
+
+    return node;
+}
diff --git a/src/affine/variable.c b/src/affine/variable.c
index 46e2446..43b706f 100644
--- a/src/affine/variable.c
+++ b/src/affine/variable.c
@@ -9,15 +9,20 @@ static void forward(expr *node, const double *u)
 
 static void jacobian_init(expr *node)
 {
-    int size = node->d1 * node->d2;
-    node->jacobian = new_csr_matrix(size, node->n_vars, size);
-    for (int j = 0; j < size; j++)
+    node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size);
+    for (int j = 0; j < node->size; j++)
     {
         node->jacobian->p[j] = j;
         node->jacobian->i[j] = j + node->var_id;
         node->jacobian->x[j] = 1.0;
     }
-    node->jacobian->p[size] = size;
+    node->jacobian->p[node->size] = node->size;
+}
+
+static void eval_jacobian(expr *node)
+{
+    /* Jacobian is already initialized with correct values */
+    (void) node;
 }
 
 static bool is_affine(const expr *node)
@@ -33,6 +38,7 @@ expr *new_variable(int d1, int d2, int var_id, int n_vars)
     node->var_id = var_id;
     node->is_affine = is_affine;
     node->jacobian_init = jacobian_init;
+    node->eval_jacobian = eval_jacobian;
 
     return node;
 }
diff --git a/src/bivariate/multiply.c b/src/bivariate/multiply.c
index e50882b..63c9290 100644
--- a/src/bivariate/multiply.c
+++ b/src/bivariate/multiply.c
@@ -44,6 +44,10 @@ static void jacobian_init(expr *node)
     node->dwork = (double *) malloc(2 * node->size * sizeof(double));
     int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz;
     node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max);
+
+    /* fill sparsity pattern */
+    sum_csr_matrices_fill_sparsity(node->left->jacobian, node->right->jacobian,
+                                   node->jacobian);
 }
 
 static void eval_jacobian(expr *node)
@@ -52,8 +56,8 @@ static void eval_jacobian(expr *node)
     expr *y = node->right;
 
     /* chain rule */
-    sum_scaled_csr_matrices(x->jacobian, y->jacobian, node->jacobian, y->value,
-                            x->value);
+    sum_scaled_csr_matrices_fill_values(x->jacobian, y->jacobian, node->jacobian,
+                                        y->value, x->value);
 }
 
 static void wsum_hess_init(expr *node)
diff --git a/src/elementwise_univariate/common.c b/src/elementwise_univariate/common.c
index 752f107..c1c412e 100644
--- a/src/elementwise_univariate/common.c
+++ b/src/elementwise_univariate/common.c
@@ -21,9 +21,14 @@ void jacobian_init_elementwise(expr *node)
     /* otherwise it will be a linear operator */
     else
     {
-        node->jacobian = new_csr_matrix(child->jacobian->m, child->jacobian->n,
-                                        child->jacobian->nnz);
+        CSR_Matrix *J = child->jacobian;
+        node->jacobian = new_csr_matrix(J->m, J->n, J->nnz);
+
         node->dwork = (double *) malloc(node->size * sizeof(double));
+
+        /* copy sparsity pattern of child */
+        memcpy(node->jacobian->p, J->p, sizeof(int) * (J->m + 1));
+        memcpy(node->jacobian->i, J->i, sizeof(int) * J->nnz);
     }
 }
 
@@ -40,7 +45,7 @@ void eval_jacobian_elementwise(expr *node)
         /* Child will be a linear operator */
         linear_op_expr *lin_child = (linear_op_expr *) child;
         node->local_jacobian(node, node->dwork);
-        diag_csr_mult(node->dwork, lin_child->A_csr, node->jacobian);
+        diag_csr_mult_fill_values(node->dwork, lin_child->A_csr, node->jacobian);
     }
 }
 
diff --git a/src/elementwise_univariate/exp.c b/src/elementwise_univariate/exp.c
index 3b27c15..1f820c0 100644
--- a/src/elementwise_univariate/exp.c
+++ b/src/elementwise_univariate/exp.c
@@ -33,5 +33,8 @@ expr *new_exp(expr *child)
     node->forward = forward;
     node->local_jacobian = local_jacobian;
     node->local_wsum_hess = local_wsum_hess;
+
+    // just for debugging, should be removed
+    strcpy(node->name, "exp");
     return node;
 }
diff --git a/src/elementwise_univariate/log.c b/src/elementwise_univariate/log.c
index 637dde3..6722747 100644
--- a/src/elementwise_univariate/log.c
+++ b/src/elementwise_univariate/log.c
@@ -38,5 +38,8 @@ expr *new_log(expr *child)
     node->forward = forward;
     node->local_jacobian = local_jacobian;
     node->local_wsum_hess = local_wsum_hess;
+
+    // just for debugging, should be removed
+    strcpy(node->name, "log");
     return node;
 }
diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_Matrix.c
index 8ee83e5..5e9aed8 100644
--- a/src/utils/CSR_Matrix.c
+++ b/src/utils/CSR_Matrix.c
@@ -97,6 +97,26 @@ void diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C)
     }
 }
 
+void diag_csr_mult_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C)
+{
+    memcpy(C->x, A->x, A->nnz * sizeof(double));
+
+    for (int row = 0; row < C->m; row++)
+    {
+        for (int j = C->p[row]; j < C->p[row + 1]; j++)
+        {
+            C->x[j] *= d[row];
+        }
+    }
+}
+
+static int compare_int_asc(const void *a, const void *b)
+{
+    int ia = *((const int *) a);
+    int ib = *((const int *) b);
+    return (ia > ib) - (ia < ib);
+}
+
 void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C)
 {
     /* A and B must be different from C */
@@ -159,6 +179,131 @@ void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C)
     C->p[A->m] = C->nnz;
 }
 
+void sum_csr_matrices_fill_sparsity(const CSR_Matrix *A, const CSR_Matrix *B,
+                                    CSR_Matrix *C)
+{
+    /* A and B must be different from C */
+    assert(A != C && B != C);
+
+    C->nnz = 0;
+
+    for (int row = 0; row < A->m; row++)
+    {
+        int a_ptr = A->p[row];
+        int a_end = A->p[row + 1];
+        int b_ptr = B->p[row];
+        int b_end = B->p[row + 1];
+        C->p[row] = C->nnz;
+
+        /* Merge while both have elements (only column indices) */
+        while (a_ptr < a_end && b_ptr < b_end)
+        {
+            if (A->i[a_ptr] < B->i[b_ptr])
+            {
+                C->i[C->nnz] = A->i[a_ptr];
+                a_ptr++;
+            }
+            else if (B->i[b_ptr] < A->i[a_ptr])
+            {
+                C->i[C->nnz] = B->i[b_ptr];
+                b_ptr++;
+            }
+            else
+            {
+                C->i[C->nnz] = A->i[a_ptr];
+                a_ptr++;
+                b_ptr++;
+            }
+            C->nnz++;
+        }
+
+        /* Copy remaining elements from A */
+        if (a_ptr < a_end)
+        {
+            int a_remaining = a_end - a_ptr;
+            memcpy(C->i + C->nnz, A->i + a_ptr, a_remaining * sizeof(int));
+            C->nnz += a_remaining;
+        }
+
+        /* Copy remaining elements from B */
+        if (b_ptr < b_end)
+        {
+            int b_remaining = b_end - b_ptr;
+            memcpy(C->i + C->nnz, B->i + b_ptr, b_remaining * sizeof(int));
+            C->nnz += b_remaining;
+        }
+    }
+
+    C->p[A->m] = C->nnz;
+}
+
+void sum_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
+                                  CSR_Matrix *C)
+{
+    /* Assumes C->p and C->i already contain the sparsity pattern of A+B.
+       Fills only C->x accordingly. */
+    for (int row = 0; row < A->m; row++)
+    {
+        int a_ptr = A->p[row];
+        int a_end = A->p[row + 1];
+        int b_ptr = B->p[row];
+        int b_end = B->p[row + 1];
+
+        for (int c_ptr = C->p[row]; c_ptr < C->p[row + 1]; c_ptr++)
+        {
+            int col = C->i[c_ptr];
+            double val = 0.0;
+
+            if (a_ptr < a_end && A->i[a_ptr] == col)
+            {
+                val += A->x[a_ptr];
+                a_ptr++;
+            }
+
+            if (b_ptr < b_end && B->i[b_ptr] == col)
+            {
+                val += B->x[b_ptr];
+                b_ptr++;
+            }
+            C->x[c_ptr] = val;
+        }
+    }
+}
+
+void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
+                                         CSR_Matrix *C, const double *d1,
+                                         const double *d2)
+{
+    /* Assumes C->p and C->i already contain the sparsity pattern of A+B.
+       Fills only C->x accordingly with scaling. */
+    for (int row = 0; row < A->m; row++)
+    {
+        int a_ptr = A->p[row];
+        int a_end = A->p[row + 1];
+        int b_ptr = B->p[row];
+        int b_end = B->p[row + 1];
+
+        for (int c_ptr = C->p[row]; c_ptr < C->p[row + 1]; c_ptr++)
+        {
+            int col = C->i[c_ptr];
+            double val = 0.0;
+
+            if (a_ptr < a_end && A->i[a_ptr] == col)
+            {
+                val += d1[row] * A->x[a_ptr];
+                a_ptr++;
+            }
+
+            if (b_ptr < b_end && B->i[b_ptr] == col)
+            {
+                val += d2[row] * B->x[b_ptr];
+                b_ptr++;
+            }
+            C->x[c_ptr] = val;
+        }
+    }
+}
+
 void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C,
                              const double *d1, const double *d2)
 {
@@ -321,6 +466,96 @@ void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
     }
 }
 
+/* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */
+void sum_block_of_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
+                                                     CSR_Matrix *C,
+                                                     int row_block_size, int *iwork,
+                                                     int *idx_map)
+{
+    assert(A->m % row_block_size == 0);
+    int n_blocks = A->m / row_block_size;
+    assert(C->m == n_blocks);
+
+    C->n = A->n;
+    C->p[0] = 0;
+    C->nnz = 0;
+
+    int *cols = iwork;
+    int *col_to_pos = iwork;
+
+    for (int block = 0; block < n_blocks; block++)
+    {
+        int start_row = block * row_block_size;
+        int end_row = start_row + row_block_size;
+
+        // -----------------------------------------------------------------
+        // Build sparsity pattern of the row resulting from summing
+        // the block of rows from A
+        // -----------------------------------------------------------------
+        C->p[block] = C->nnz;
+        int count = 0;
+        for (int row = start_row; row < end_row; row++)
+        {
+            for (int j = A->p[row]; j < A->p[row + 1]; j++)
+            {
+                cols[count++] = A->i[j];
+            }
+        }
+
+        /* Sort columns and write unique pattern into C->i */
+        qsort(cols, count, sizeof(int), compare_int_asc);
+
+        int unique_nnz = 0;
+        int prev_col = -1;
+        for (int t = 0; t < count; t++)
+        {
+            int col = cols[t];
+            if (t == 0 || col != prev_col)
+            {
+                C->i[C->nnz + unique_nnz] = col;
+                prev_col = col;
+                unique_nnz++;
+            }
+        }
+
+        C->nnz += unique_nnz;
+        C->p[block + 1] = C->nnz;
+
+        // -----------------------------------------------------------------
+        //         Build idx_map for all entries in this block
+        // -----------------------------------------------------------------
+        int row_start = C->p[block];
+        for (int idx = 0; idx < unique_nnz; idx++)
+        {
+            col_to_pos[C->i[row_start + idx]] = row_start + idx;
+        }
+
+        for (int row = start_row; row < end_row; row++)
+        {
+            for (int j = A->p[row]; j < A->p[row + 1]; j++)
+            {
+                idx_map[j] = col_to_pos[A->i[j]];
+            }
+        }
+    }
+}
+
+/*
+void sum_block_of_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+                                       const int *idx_map)
+{
+    memset(C->x, 0, C->nnz * sizeof(double));
+
+    for (int row = 0; row < A->m; row++)
+    {
+        for (int j = A->p[row]; j < A->p[row + 1]; j++)
+        {
+            C->x[idx_map[j]] += A->x[j];
+        }
+    }
+}
+*/
+
 void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
                                 struct int_double_pair *pairs, int row_spacing)
 {
@@ -369,6 +604,149 @@ void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
     }
 }
 
+/* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */
+void sum_evenly_spaced_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
+                                                          CSR_Matrix *C,
+                                                          int row_spacing,
+                                                          int *iwork, int *idx_map)
+{
+    assert(C->m == row_spacing);
+    C->n = A->n;
+    C->p[0] = 0;
+    C->nnz = 0;
+
+    int *cols = iwork;
+    int *col_to_pos = iwork;
+
+    for (int C_row = 0; C_row < C->m; C_row++)
+    {
+        // -----------------------------------------------------------------
+        // Build sparsity pattern of the row resulting from summing
+        // evenly spaced rows from A
+        // -----------------------------------------------------------------
+        C->p[C_row] = C->nnz;
+        int count = 0;
+        for (int row = C_row; row < A->m; row += row_spacing)
+        {
+            for (int j = A->p[row]; j < A->p[row + 1]; j++)
+            {
+                cols[count++] = A->i[j];
+            }
+        }
+
+        /* Sort columns and write unique pattern into C->i */
+        qsort(cols, count, sizeof(int), compare_int_asc);
+
+        int unique_nnz = 0;
+        int prev_col = -1;
+        for (int t = 0; t < count; t++)
+        {
+            int col = cols[t];
+            if (t == 0 || col != prev_col)
+            {
+                C->i[C->nnz + unique_nnz] = col;
+                prev_col = col;
+                unique_nnz++;
+            }
+        }
+
+        C->nnz += unique_nnz;
+        C->p[C_row + 1] = C->nnz;
+
+        // -----------------------------------------------------------------
+        //         Build idx_map for all entries in evenly spaced rows
+        // -----------------------------------------------------------------
+        int row_start = C->p[C_row];
+        for (int idx = 0; idx < unique_nnz; idx++)
+        {
+            col_to_pos[C->i[row_start + idx]] = row_start + idx;
+        }
+
+        for (int row = C_row; row < A->m; row += row_spacing)
+        {
+            for (int j = A->p[row]; j < A->p[row + 1]; j++)
+            {
+                idx_map[j] = col_to_pos[A->i[j]];
+            }
+        }
+    }
+}
+
+void idx_map_accumulator(const CSR_Matrix *A, const int *idx_map,
+                         double *accumulator)
+{
+    memset(accumulator, 0, A->nnz * sizeof(double));
+
+    for (int row = 0; row < A->m; row++)
+    {
+        for (int j = A->p[row]; j < A->p[row + 1]; j++)
+        {
+            accumulator[idx_map[j]] += A->x[j];
+        }
+    }
+}
+
+/*
+void sum_evenly_spaced_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+                                            const int *idx_map)
+{
+    memset(C->x, 0, C->nnz * sizeof(double));
+
+    for (int row = 0; row < A->m; row++)
+    {
+        for (int j = A->p[row]; j < A->p[row + 1]; j++)
+        {
+            C->x[idx_map[j]] += A->x[j];
+        }
+    }
+}
+*/
+void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C,
+                                  struct int_double_pair *pairs, int offset,
+                                  int spacing)
+{
+    assert(C->m == 1);
+    C->n = A->n;
+    C->p[0] = 0;
+    C->nnz = 0;
+
+    /* copy evenly spaced rows starting at offset into pairs */
+    int pair_idx = 0;
+    for (int row = offset; row < A->m; row += spacing)
+    {
+        for (int j = A->p[row]; j < A->p[row + 1]; j++)
+        {
+            pairs[pair_idx].col = A->i[j];
+            pairs[pair_idx].val = A->x[j];
+            pair_idx++;
+        }
+    }
+
+    /* sort so columns are in order */
+    sort_int_double_pair_array(pairs, pair_idx);
+
+    /* merge entries with same columns and insert result in C */
+    for (int j = 0; j < pair_idx;)
+    {
+        int current_col = pairs[j].col;
+        double sum_val = 0.0;
+
+        /* sum all values with the same column */
+        while (j < pair_idx && pairs[j].col == current_col)
+        {
+            sum_val += pairs[j].val;
+            j++;
+        }
+
+        /* insert into C */
+        C->i[C->nnz] = current_col;
+        C->x[C->nnz] = sum_val;
+        C->nnz++;
+    }
+
+    C->p[1] = C->nnz;
+}
+
 void csr_insert_value(CSR_Matrix *A, int col_idx, double value)
 {
     assert(A->m == 1);
@@ -599,3 +977,63 @@ void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C)
 
     free(counts);
 }
+
+void sum_all_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A, CSR_Matrix *C,
+                                                int *iwork, int *idx_map)
+{
+    // -------------------------------------------------------------------
+    //           Build sparsity pattern of the summed row
+    // -------------------------------------------------------------------
+    int *cols = iwork;
+    memcpy(cols, A->i, A->nnz * sizeof(int));
+    qsort(cols, A->nnz, sizeof(int), compare_int_asc);
+
+    int unique_nnz = 0;
+    int prev_col = -1;
+    for (int j = 0; j < A->nnz; j++)
+    {
+        if (cols[j] != prev_col)
+        {
+            C->i[unique_nnz] = cols[j];
+            prev_col = cols[j];
+            unique_nnz++;
+        }
+    }
+
+    C->p[0] = 0;
+    C->p[1] = unique_nnz;
+    C->nnz = unique_nnz;
+
+    // -------------------------------------------------------------------
+    //         Map child values to summed-row positions
+    // -------------------------------------------------------------------
+    int *col_to_pos = iwork;
+    for (int idx = 0; idx < unique_nnz; idx++)
+    {
+        col_to_pos[C->i[idx]] = idx;
+    }
+
+    for (int i = 0; i < A->m; i++)
+    {
+        for (int j = A->p[i]; j < A->p[i + 1]; j++)
+        {
+            idx_map[j] = col_to_pos[A->i[j]];
+        }
+    }
+}
+
+/*
+void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
+                                  const int *idx_map)
+{
+    memset(C->x, 0, C->nnz * sizeof(double));
+
+    for (int row = 0; row < A->m; row++)
+    {
+        for (int j = A->p[row]; j < A->p[row + 1]; j++)
+        {
+            C->x[idx_map[j]] += A->x[j];
+        }
+    }
+}
+*/
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 2801647..768ba86 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -20,6 +20,7 @@
 #include "jacobian_tests/test_quad_over_lin.h"
 #include "jacobian_tests/test_rel_entr.h"
 #include "jacobian_tests/test_sum.h"
+#include "jacobian_tests/test_trace.h"
 #include "utils/test_csc_matrix.h"
 #include "utils/test_csr_matrix.h"
 #include "wsum_hess/elementwise/test_entr.h"
@@ -37,6 +38,7 @@
 #include "wsum_hess/test_quad_over_lin.h"
 #include "wsum_hess/test_rel_entr.h"
 #include "wsum_hess/test_sum.h"
+#include "wsum_hess/test_trace.h"
 
 int main(void)
 {
@@ -87,8 +89,8 @@ int main(void)
     mu_run_test(test_jacobian_sum_log_axis_1, tests_run);
     mu_run_test(test_jacobian_hstack_vectors, tests_run);
     mu_run_test(test_jacobian_hstack_matrix, tests_run);
-    mu_run_test(test_wsum_hess_multiply_1, tests_run);
-    mu_run_test(test_wsum_hess_multiply_2, tests_run);
+    mu_run_test(test_jacobian_trace_variable, tests_run);
+    mu_run_test(test_jacobian_trace_composite, tests_run);
 
     printf("\n--- Weighted Sum of Hessian Tests ---\n");
     mu_run_test(test_wsum_hess_log, tests_run);
@@ -121,6 +123,14 @@ int main(void)
     mu_run_test(test_wsum_hess_quad_form, tests_run);
     mu_run_test(test_wsum_hess_multiply_linear_ops, tests_run);
     mu_run_test(test_wsum_hess_multiply_sparse_random, tests_run);
+    mu_run_test(test_wsum_hess_multiply_1, tests_run);
+    mu_run_test(test_wsum_hess_multiply_2, tests_run);
+    // This test leads to seg fault
+    // mu_run_test(test_wsum_hess_trace_variable, tests_run);
+
+    // This test fails - not sure how sophisticated we should make
+    // wsum_hess for trace
+    // mu_run_test(test_wsum_hess_trace_composite, tests_run);
 
     printf("\n--- Utility Tests ---\n");
     mu_run_test(test_diag_csr_mult, tests_run);
diff --git a/tests/jacobian_tests/test_trace.h b/tests/jacobian_tests/test_trace.h
new file mode 100644
index 0000000..b2f9810
--- /dev/null
+++ b/tests/jacobian_tests/test_trace.h
@@ -0,0 +1,103 @@
+#include <stdio.h>
+
+#include "affine.h"
+#include "elementwise_univariate.h"
+#include "expr.h"
+#include "minunit.h"
+#include "test_helpers.h"
+
+const char *test_jacobian_trace_variable()
+{
+    /* Test Jacobian of trace(x) where x is 3x3 variable
+     * x has global variable index 1
+     * Total 13 variables
+     * x.value = [1, 2, 3, 4, 5, 6, 7, 8, 9] (column-wise)
+     * Stored as:
+     * [[1, 4, 7],
+     *  [2, 5, 8],
+     *  [3, 6, 9]]
+     * Diagonal: [1, 5, 9]
+     * trace(x) = 1 + 5 + 9 = 15
+     *
+     * Jacobian: d(trace)/dx has nonzeros only at diagonal positions
+     * Indices (0-indexed, column-wise): 0, 4, 8
+     * So global variable indices: 1, 5, 9 (offset by 1)
+     * Values: [1, 1, 1]
+     * Result is 1x13 sparse matrix with 3 nonzeros
+     */
+    double u_vals[13] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                         7.0, 8.0, 9.0, 0.0, 0.0, 0.0};
+    expr *x = new_variable(3, 3, 1, 13);
+    expr *trace_node = new_trace(x);
+
+    trace_node->forward(trace_node, u_vals);
+    trace_node->jacobian_init(trace_node);
+    trace_node->eval_jacobian(trace_node);
+
+    double expected_Ax[3] = {1.0, 1.0, 1.0};
+    int expected_Ap[2] = {0, 3};
+    int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */
+
+    mu_assert("vals fail",
+              cmp_double_array(trace_node->jacobian->x, expected_Ax, 3));
+    mu_assert("rows fail", cmp_int_array(trace_node->jacobian->p, expected_Ap, 2));
+    mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, expected_Ai, 3));
+
+    free_expr(trace_node);
+    return 0;
+}
+
+const char *test_jacobian_trace_composite()
+{
+    /* Test Jacobian of trace(log(x) + exp(x)) where x is 3x3 variable
+     * x has global variable index 1
+     * Total 13 variables
+     * x.value = [1, 2, 3, 4, 5, 6, 7, 8, 9] (column-wise)
+     * Diagonal elements: x[0, 0] = 1, x[1, 1] = 5, x[2, 2] = 9
+     *
+     * log(x) + exp(x) elementwise: [log(x_ij) + exp(x_ij)]
+     * At diagonal: [log(1) + exp(1), log(5) + exp(5), log(9) + exp(9)]
+     *
+     * d/dx (log(x) + exp(x)) = [1/x + exp(x)]
+     * At diagonal elements:
+     * x[0,0] = 1: 1/1 + exp(1) ≈ 1 + 2.718... ≈ 3.718...
+     * x[1,1] = 5: 1/5 + exp(5) = 0.2 + 148.413... ≈ 148.613...
+     * x[2,2] = 9: 1/9 + exp(9) ≈ 0.111... + 8103.08... ≈ 8103.19...
+     *
+     * Jacobian (1x13 sparse):
+     * Nonzeros at columns [1, 5, 9] with values above
+     * We'll compute and check the actual values
+     */
+    double u_vals[13] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                         7.0, 8.0, 9.0, 0.0, 0.0, 0.0};
+    expr *x = new_variable(3, 3, 1, 13);
+    expr *log_node = new_log(x);
+    expr *exp_node = new_exp(x);
+    expr *add_node = new_add(log_node, exp_node);
+    expr *trace_node = new_trace(add_node);
+
+    trace_node->jacobian_init(trace_node);
+    trace_node->forward(trace_node, u_vals);
+    trace_node->eval_jacobian(trace_node);
+
+    /* Expected values: d(log(x_ii) + exp(x_ii))/dx_ii = 1/x_ii + exp(x_ii)
+     * At x_00 = 1: 1/1 + exp(1) = 1 + 2.718281828...
+     * At x_11 = 5: 1/5 + exp(5) = 0.2 + 148.413159103...
+     * At x_22 = 9: 1/9 + exp(9) = 0.111... + 8103.083927576...
+     */
+    double exp_1 = 2.718281828;
+    double exp_5 = 148.413159103;
+    double exp_9 = 8103.083927576;
+    double expected_Ax[3] = {1.0 + exp_1, 0.2 + exp_5, 1.0 / 9.0 + exp_9};
+    int expected_Ap[2] = {0, 3};
+    int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */
+
+    mu_assert("vals match count", trace_node->jacobian->nnz == 3);
+    mu_assert("rows fail", cmp_int_array(trace_node->jacobian->p, expected_Ap, 2));
+    mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, expected_Ai, 3));
+    mu_assert("vals fail",
+              cmp_double_array(trace_node->jacobian->x, expected_Ax, 3));
+
+    free_expr(trace_node);
+    return 0;
+}
diff --git a/tests/wsum_hess/test_trace.h b/tests/wsum_hess/test_trace.h
new file mode 100644
index 0000000..372f8d1
--- /dev/null
+++ b/tests/wsum_hess/test_trace.h
@@ -0,0 +1,109 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "affine.h"
+#include "elementwise_univariate.h"
+#include "expr.h"
+#include "minunit.h"
+#include "test_helpers.h"
+
+const char *test_wsum_hess_trace_variable()
+{
+    /* Test weighted sum of Hessian of trace(x) where x is 3x3 variable
+     * x has global variable index 1
+     * Total 13 variables
+     * x.value = [1, 2, 3, 4, 5, 6, 7, 8, 9] (column-wise)
+     * Weight = 2.0
+     *
+     * trace(x) = x[0,0] + x[1,1] + x[2,2] = 1 + 5 + 9 = 15
+     * Since x is linear (identity mapping), Hessian is zero.
+     * wsum_hess should be a 13x13 zero sparse matrix (or empty)
+     */
+    double u_vals[13] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                         7.0, 8.0, 9.0, 0.0, 0.0, 0.0};
+    double w = 2.0;
+
+    expr *x = new_variable(3, 3, 1, 13);
+    expr *trace_node = new_trace(x);
+
+    trace_node->forward(trace_node, u_vals);
+    trace_node->jacobian_init(trace_node);
+    trace_node->wsum_hess_init(trace_node);
+    trace_node->eval_wsum_hess(trace_node, &w);
+
+    /* For a linear operation (variable), Hessian is zero */
+    mu_assert("wsum_hess should be empty", trace_node->wsum_hess->nnz == 0);
+
+    mu_assert("dims correct",
+              trace_node->wsum_hess->m == 13 && trace_node->wsum_hess->n == 13);
+
+    free_expr(trace_node);
+    return 0;
+}
+
+const char *test_wsum_hess_trace_composite()
+{
+    /* Test weighted sum of Hessian of trace(log(x) + exp(x)) where x is 3x3 variable
+     * x has global variable index 1
+     * Total 13 variables
+     * x.value = [1, 2, 3, 4, 5, 6, 7, 8, 9] (column-wise)
+     * Weight = 2.0
+     *
+     * f(x) = trace(log(x) + exp(x))
+     * d/dx (log(x) + exp(x)) = 1/x + exp(x)
+     * d²/dx² (log(x) + exp(x)) = -1/x² + exp(x)
+     *
+     * For diagonal elements:
+     * x[0,0] = 1: d²/dx² = -1/1² + exp(1) = -1 + 2.718... ≈ 1.718...
+     * x[1,1] = 5: d²/dx² = -1/25 + exp(5) = -0.04 + 148.413... ≈ 148.373...
+     * x[2,2] = 9: d²/dx² = -1/81 + exp(9) ≈ -0.0123... + 8103.08... ≈ 8103.07...
+     *
+     * wsum_hess (13x13) has nonzeros only at diagonal positions [1,1], [5,5], [9,9]
+     * with values = weight * second derivatives = 2 * values above
+     */
+    double u_vals[13] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                         7.0, 8.0, 9.0, 0.0, 0.0, 0.0};
+    double w = 2.0;
+
+    expr *x = new_variable(3, 3, 1, 13);
+    expr *log_node = new_log(x);
+    expr *exp_node = new_exp(x);
+    expr *add_node = new_add(log_node, exp_node);
+    expr *trace_node = new_trace(add_node);
+
+    trace_node->forward(trace_node, u_vals);
+    trace_node->jacobian_init(trace_node);
+    trace_node->wsum_hess_init(trace_node);
+    trace_node->eval_wsum_hess(trace_node, &w);
+
+    /* Expected diagonal Hessian values at indices [1,1], [5,5], [9,9]
+     * d²(log(x_ii) + exp(x_ii))/dx_ii² = -1/x_ii² + exp(x_ii)
+     * At x_00 = 1: -1 + exp(1) = -1 + 2.718281828...
+     * At x_11 = 5: -1/25 + exp(5) = -0.04 + 148.413159103...
+     * At x_22 = 9: -1/81 + exp(9) ≈ -0.01234... + 8103.083927576...
+     *
+     * wsum_hess values = 2 * second derivatives
+     */
+    double exp_1 = 2.718281828;
+    double exp_5 = 148.413159103;
+    double exp_9 = 8103.083927576;
+
+    double d2_1 = -1.0 + exp_1;
+    double d2_5 = -0.04 + exp_5;
+    double d2_9 = -1.0 / 81.0 + exp_9;
+
+    double expected_Ax[3] = {w * d2_1, w * d2_5, w * d2_9};
+    int expected_Ap[14] = {0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3};
+    int expected_Ai[3] = {1, 5, 9};
+
+    mu_assert("nnz wrong", trace_node->wsum_hess->nnz == 3);
+    mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->p, expected_Ap, 14));
+    mu_assert("vals match",
+              cmp_double_array(trace_node->wsum_hess->x, expected_Ax, 3));
+    mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->i, expected_Ai, 3));
+
+    free_expr(trace_node);
+    return 0;
+}