From b9071a7c25dd0c8289026f18b78e5aeff55f1552 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Wed, 21 Jan 2026 21:53:18 -0500
Subject: [PATCH 1/2] Add diag_vec atom for creating diagonal matrices from
 vectors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements diag_vec which converts a vector of size n into an n×n diagonal
matrix. Includes forward pass, Jacobian, and Hessian computations.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 include/affine.h                 |   1 +
 python/atoms/diag_vec.h          |  32 +++++++
 python/bindings.c                |   2 +
 src/affine/diag_vec.c            | 140 +++++++++++++++++++++++++++++++
 src/dnlp_diff_engine/__init__.py |   1 +
 5 files changed, 176 insertions(+)
 create mode 100644 python/atoms/diag_vec.h
 create mode 100644 src/affine/diag_vec.c

diff --git a/include/affine.h b/include/affine.h
index 5905e61..2405009 100644
--- a/include/affine.h
+++ b/include/affine.h
@@ -21,5 +21,6 @@ expr *new_variable(int d1, int d2, int var_id, int n_vars);
 expr *new_index(expr *child, int d1, int d2, const int *indices, int n_idxs);
 expr *new_reshape(expr *child, int d1, int d2);
 expr *new_broadcast(expr *child, int target_d1, int target_d2);
+expr *new_diag_vec(expr *child);
 
 #endif /* AFFINE_H */
diff --git a/python/atoms/diag_vec.h b/python/atoms/diag_vec.h
new file mode 100644
index 0000000..b1056d6
--- /dev/null
+++ b/python/atoms/diag_vec.h
@@ -0,0 +1,32 @@
+#ifndef ATOM_DIAG_VEC_H
+#define ATOM_DIAG_VEC_H
+
+#include "common.h"
+
+static PyObject *py_make_diag_vec(PyObject *self, PyObject *args)
+{
+    PyObject *child_capsule;
+
+    if (!PyArg_ParseTuple(args, "O", &child_capsule))
+    {
+        return NULL;
+    }
+
+    expr *child = (expr *) PyCapsule_GetPointer(child_capsule, EXPR_CAPSULE_NAME);
+    if (!child)
+    {
+        return NULL;
+    }
+
+    expr *node = new_diag_vec(child);
+    if (!node)
+    {
+        PyErr_SetString(PyExc_RuntimeError, "failed to create diag_vec node");
+        return NULL;
+    }
+
+    expr_retain(node);
+    return PyCapsule_New(node, EXPR_CAPSULE_NAME, expr_capsule_destructor);
+}
+
+#endif /* ATOM_DIAG_VEC_H */
diff --git a/python/bindings.c b/python/bindings.c
index 10f263d..6998f0f 100644
--- a/python/bindings.c
+++ b/python/bindings.c
@@ -11,6 +11,7 @@
 #include "atoms/const_vector_mult.h"
 #include "atoms/constant.h"
 #include "atoms/cos.h"
+#include "atoms/diag_vec.h"
 #include "atoms/entr.h"
 #include "atoms/exp.h"
 #include "atoms/getters.h"
@@ -95,6 +96,7 @@ static PyMethodDef DNLPMethods[] = {
      "Create prod_axis_one node"},
     {"make_sin", py_make_sin, METH_VARARGS, "Create sin node"},
     {"make_cos", py_make_cos, METH_VARARGS, "Create cos node"},
+    {"make_diag_vec", py_make_diag_vec, METH_VARARGS, "Create diag_vec node"},
     {"make_tan", py_make_tan, METH_VARARGS, "Create tan node"},
     {"make_sinh", py_make_sinh, METH_VARARGS, "Create sinh node"},
     {"make_tanh", py_make_tanh, METH_VARARGS, "Create tanh node"},
diff --git a/src/affine/diag_vec.c b/src/affine/diag_vec.c
new file mode 100644
index 0000000..6f6a9b2
--- /dev/null
+++ b/src/affine/diag_vec.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include "affine.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* diag_vec: converts a vector of size n into an n×n diagonal matrix.
+ * In Fortran (column-major) order, element i of the input maps to
+ * position i*(n+1) in the flattened output (the diagonal positions). */
+
+static void forward(expr *node, const double *u)
+{
+    expr *x = node->left;
+    int n = x->size;
+
+    /* child's forward pass */
+    x->forward(x, u);
+
+    /* zero-initialize output */
+    memset(node->value, 0, node->size * sizeof(double));
+
+    /* place input elements on the diagonal */
+    for (int i = 0; i < n; i++)
+    {
+        node->value[i * (n + 1)] = x->value[i];
+    }
+}
+
+static void jacobian_init(expr *node)
+{
+    expr *x = node->left;
+    int n = x->size;
+    x->jacobian_init(x);
+
+    CSR_Matrix *Jx = x->jacobian;
+
+    /* Output Jacobian has n² rows, but only n rows (diagonal positions) are non-zero.
+     * We allocate space for the same nnz as child, but with n² rows. */
+    CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz);
+
+    /* Build row pointers: rows at diagonal positions copy from child,
+     * all other rows are empty */
+    J->p[0] = 0;
+    int child_row = 0;
+    for (int out_row = 0; out_row < node->size; out_row++)
+    {
+        if (out_row == child_row * (n + 1) && child_row < n)
+        {
+            /* This is a diagonal position - copy sparsity from child row */
+            int len = Jx->p[child_row + 1] - Jx->p[child_row];
+            memcpy(J->i + J->p[out_row], Jx->i + Jx->p[child_row], len * sizeof(int));
+            J->p[out_row + 1] = J->p[out_row] + len;
+            child_row++;
+        }
+        else
+        {
+            /* Non-diagonal row - empty */
+            J->p[out_row + 1] = J->p[out_row];
+        }
+    }
+
+    node->jacobian = J;
+}
+
+static void eval_jacobian(expr *node)
+{
+    expr *x = node->left;
+    int n = x->size;
+    x->eval_jacobian(x);
+
+    CSR_Matrix *J = node->jacobian;
+    CSR_Matrix *Jx = x->jacobian;
+
+    /* Copy values from child Jacobian to diagonal positions */
+    int child_row = 0;
+    for (int out_row = 0; out_row < node->size && child_row < n; out_row++)
+    {
+        if (out_row == child_row * (n + 1))
+        {
+            int len = J->p[out_row + 1] - J->p[out_row];
+            memcpy(J->x + J->p[out_row], Jx->x + Jx->p[child_row], len * sizeof(double));
+            child_row++;
+        }
+    }
+}
+
+static void wsum_hess_init(expr *node)
+{
+    expr *x = node->left;
+
+    /* initialize child's wsum_hess */
+    x->wsum_hess_init(x);
+
+    /* workspace for extracting diagonal weights */
+    node->dwork = (double *) calloc(x->size, sizeof(double));
+
+    /* Copy child's Hessian structure (diag_vec is linear, so its own Hessian is zero) */
+    CSR_Matrix *Hx = x->wsum_hess;
+    node->wsum_hess = new_csr_matrix(Hx->m, Hx->n, Hx->nnz);
+    memcpy(node->wsum_hess->p, Hx->p, (Hx->m + 1) * sizeof(int));
+    memcpy(node->wsum_hess->i, Hx->i, Hx->nnz * sizeof(int));
+}
+
+static void eval_wsum_hess(expr *node, const double *w)
+{
+    expr *x = node->left;
+    int n = x->size;
+
+    /* Extract weights from diagonal positions of w (which has n² elements) */
+    for (int i = 0; i < n; i++)
+    {
+        node->dwork[i] = w[i * (n + 1)];
+    }
+
+    /* Evaluate child's Hessian with extracted weights */
+    x->eval_wsum_hess(x, node->dwork);
+    memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double));
+}
+
+static bool is_affine(const expr *node)
+{
+    return node->left->is_affine(node->left);
+}
+
+expr *new_diag_vec(expr *child)
+{
+    /* child must be a vector: either column (n, 1) or row (1, n) */
+    assert(child->d1 == 1 || child->d2 == 1);
+
+    /* n is the number of elements (works for both row and column vectors) */
+    int n = child->size;
+    expr *node = (expr *) calloc(1, sizeof(expr));
+    init_expr(node, n, n, child->n_vars, forward, jacobian_init, eval_jacobian,
+              is_affine, wsum_hess_init, eval_wsum_hess, NULL);
+    node->left = child;
+    expr_retain(child);
+
+    return node;
+}
diff --git a/src/dnlp_diff_engine/__init__.py b/src/dnlp_diff_engine/__init__.py
index b1ab670..fae5fff 100644
--- a/src/dnlp_diff_engine/__init__.py
+++ b/src/dnlp_diff_engine/__init__.py
@@ -20,6 +20,7 @@
     "make_promote",
     "make_index",
     "make_reshape",
+    "make_diag_vec",
     "make_log",
     "make_exp",
     "make_power",

From 1168e68a188f5a5309e10f1aae27f4b574376d4c Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Wed, 21 Jan 2026 22:04:31 -0500
Subject: [PATCH 2/2] Clean up diag_vec jacobian_init and eval_jacobian

- Use standard CSR building pattern (J->p[row] = nnz)
- Use next_diag counter instead of checking row == child_row * (n+1)
- Simplify eval_jacobian to O(n) loop computing out_row directly

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/affine/diag_vec.c | 44 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/src/affine/diag_vec.c b/src/affine/diag_vec.c
index 6f6a9b2..5d2a303 100644
--- a/src/affine/diag_vec.c
+++ b/src/affine/diag_vec.c
@@ -34,31 +34,25 @@ static void jacobian_init(expr *node)
     x->jacobian_init(x);
 
     CSR_Matrix *Jx = x->jacobian;
-
-    /* Output Jacobian has n² rows, but only n rows (diagonal positions) are non-zero.
-     * We allocate space for the same nnz as child, but with n² rows. */
     CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz);
 
-    /* Build row pointers: rows at diagonal positions copy from child,
-     * all other rows are empty */
-    J->p[0] = 0;
-    int child_row = 0;
-    for (int out_row = 0; out_row < node->size; out_row++)
+    /* Output has n² rows but only n diagonal positions are non-empty.
+     * Diagonal position i is at row i*(n+1) in Fortran order. */
+    int nnz = 0;
+    int next_diag = 0;
+    for (int row = 0; row < node->size; row++)
     {
-        if (out_row == child_row * (n + 1) && child_row < n)
+        J->p[row] = nnz;
+        if (row == next_diag)
         {
-            /* This is a diagonal position - copy sparsity from child row */
+            int child_row = row / (n + 1);
             int len = Jx->p[child_row + 1] - Jx->p[child_row];
-            memcpy(J->i + J->p[out_row], Jx->i + Jx->p[child_row], len * sizeof(int));
-            J->p[out_row + 1] = J->p[out_row] + len;
-            child_row++;
-        }
-        else
-        {
-            /* Non-diagonal row - empty */
-            J->p[out_row + 1] = J->p[out_row];
+            memcpy(J->i + nnz, Jx->i + Jx->p[child_row], len * sizeof(int));
+            nnz += len;
+            next_diag += n + 1;
         }
     }
+    J->p[node->size] = nnz;
 
     node->jacobian = J;
 }
@@ -72,16 +66,12 @@ static void eval_jacobian(expr *node)
     CSR_Matrix *J = node->jacobian;
     CSR_Matrix *Jx = x->jacobian;
 
-    /* Copy values from child Jacobian to diagonal positions */
-    int child_row = 0;
-    for (int out_row = 0; out_row < node->size && child_row < n; out_row++)
+    /* Copy values from child row i to output diagonal row i*(n+1) */
+    for (int i = 0; i < n; i++)
     {
-        if (out_row == child_row * (n + 1))
-        {
-            int len = J->p[out_row + 1] - J->p[out_row];
-            memcpy(J->x + J->p[out_row], Jx->x + Jx->p[child_row], len * sizeof(double));
-            child_row++;
-        }
+        int out_row = i * (n + 1);
+        int len = J->p[out_row + 1] - J->p[out_row];
+        memcpy(J->x + J->p[out_row], Jx->x + Jx->p[i], len * sizeof(double));
     }
 }