diff --git a/.gitignore b/.gitignore index f7de229..5f3d53d 100644 --- a/.gitignore +++ b/.gitignore @@ -75,4 +75,5 @@ out/ .DS_Store .venv/ -__pycache__/ \ No newline at end of file +__pycache__/ +uv.lock diff --git a/CLAUDE.md b/CLAUDE.md index 43b2bfd..c44e1aa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -58,12 +58,12 @@ The core abstraction is the `expr` struct (in `include/expr.h`) representing a n Atoms are organized by mathematical properties in `src/`: -- **`affine/`** - Linear operations: `variable`, `constant`, `add`, `neg`, `sum`, `promote`, `hstack`, `trace`, `linear_op`, `index` -- **`elementwise_univariate/`** - Scalar functions applied elementwise: `log`, `exp`, `entr`, `power`, `logistic`, `xexp`, trigonometric (`sin`, `cos`, `tan`), hyperbolic (`sinh`, `tanh`, `asinh`, `atanh`) -- **`bivariate/`** - Two-argument operations: `multiply`, `quad_over_lin`, `rel_entr`, `const_scalar_mult`, `const_vector_mult`, `left_matmul`, `right_matmul` -- **`other/`** - Special atoms: `quad_form`, `prod` +- **`affine/`** - Linear operations: `variable`, `constant`, `add`, `neg`, `sum`, `promote`, `hstack`, `trace`, `linear_op`, `index`, `reshape` +- **`elementwise_univariate/`** - Scalar functions applied elementwise: `log`, `exp`, `entr`, `power`, `logistic`, `xexp`, trigonometric (`sin`, `cos`, `tan`), hyperbolic (`sinh`, `tanh`, `asinh`, `atanh`). Uses `common.c` for shared chain-rule patterns. +- **`bivariate/`** - Two-argument operations: `multiply`, `quad_over_lin`, `rel_entr`, `const_scalar_mult`, `const_vector_mult`, `left_matmul` (A @ f(x)), `right_matmul` (f(x) @ A) +- **`other/`** - Special atoms: `quad_form` (x'Px), `prod` (product of elements) -Each atom implements its own `forward`, `jacobian_init`, `eval_jacobian`, and `eval_wsum_hess` functions following a consistent pattern. +Each atom implements: `forward`, `jacobian_init`, `eval_jacobian`, and optionally `eval_wsum_hess` (defaults to zero for affine atoms). 
### Problem Struct @@ -95,12 +95,14 @@ The Python package `dnlp_diff_engine` (in `src/dnlp_diff_engine/`) provides: ### Derivative Computation Flow -1. Call `problem_init_derivatives()` to allocate Jacobian/Hessian storage and compute sparsity patterns -2. Call forward pass (`objective_forward` / `constraint_forward`) to propagate values through tree -3. Call derivative functions (`gradient`, `jacobian`, `hessian`) which traverse tree computing derivatives +1. **Initialization**: `problem_init_derivatives()` allocates storage and computes sparsity patterns for all Jacobians and Hessians. This is done once per problem. +2. **Forward pass**: `objective_forward(u)` / `constraint_forward(u)` propagate values through expression tree, storing results in each node's `value` field. +3. **Derivative computation**: `gradient()`, `jacobian()`, `hessian()` traverse tree computing derivatives via chain rule. -Jacobian uses chain rule: each node computes local Jacobian, combined via sparse matrix operations. -Hessian computes weighted sum: `obj_w * H_obj + sum(lambda_i * H_constraint_i)` +**Key invariant**: Forward pass must be called before corresponding derivative functions. The derivatives are computed using values cached during forward pass. + +Jacobian uses chain rule: `J_composite = J_outer * J_inner` via sparse matrix operations. +Hessian computes weighted sum: `obj_w * H_obj + sum(lambda_i * H_constraint_i)`, returning lower triangular. 
### Sparse Matrix Utilities @@ -110,11 +112,11 @@ Hessian computes weighted sum: `obj_w * H_obj + sum(lambda_i * H_constraint_i)` - `include/` - Header files defining public API (`expr.h`, `problem.h`, atom headers) - `src/` - C implementation files organized by atom category -- `src/dnlp_diff_engine/` - Python package with high-level API +- `src/dnlp_diff_engine/` - Python package with high-level API (`__init__.py` contains `C_problem` class and `ATOM_CONVERTERS`) - `python/` - Python bindings C code (`bindings.c`) - `python/atoms/` - Python binding headers for each atom type - `python/problem/` - Python binding headers for problem interface -- `python/tests/` - Python integration tests (run via pytest) +- `python/tests/` - Python integration tests (run via pytest): `test_unconstrained.py`, `test_constrained.py`, `test_problem_native.py` - `tests/` - C tests using minunit framework - `tests/forward_pass/` - Forward evaluation tests (C) - `tests/jacobian_tests/` - Jacobian correctness tests (C) @@ -131,6 +133,12 @@ Hessian computes weighted sum: `obj_w * H_obj + sum(lambda_i * H_constraint_i)` 7. Rebuild: `pip install -e .` 8. Add tests in `tests/` (C) and `tests/python/` (Python) +## Known Limitations + +- **Bivariate matmul not supported**: `f(x) @ g(x)` where both sides depend on variables is not implemented. Only `A @ f(x)` and `f(x) @ A` with constant A work. +- **Reshape order**: Only Fortran order (`order='F'`) is supported. C order would require permutation logic. +- **Hessian sparsity**: Some atoms (`hstack`, `trace`) don't compute hessian sparsity patterns during initialization (see TODO.md). 
+ ## License Header ```c diff --git a/pyproject.toml b/pyproject.toml index fc4c8ae..37c735f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,18 +5,18 @@ build-backend = "scikit_build_core.build" [project] name = "dnlp-diff-engine" version = "0.1.0" -description = "Automatic differentiation engine for DNLP optimization problems" +description = "Low-level C autodiff engine for nonlinear optimization" readme = "README.md" requires-python = ">=3.10" dependencies = [ "numpy", - "scipy", - "cvxpy", ] [project.optional-dependencies] test = [ "pytest>=7.0", + "cvxpy", # For integration tests + "scipy", # Required by cvxpy integration ] dev = [ "pytest>=7.0", diff --git a/python/tests/test_constrained.py b/python/tests/test_constrained.py index f7a933b..3b21897 100644 --- a/python/tests/test_constrained.py +++ b/python/tests/test_constrained.py @@ -3,7 +3,7 @@ import cvxpy as cp import numpy as np -from dnlp_diff_engine import C_problem +from cvxpy.reductions.solvers.nlp_solvers.diff_engine import C_problem # Note: CVXPY converts constraints A >= B to B - A <= 0 # So constr.expr for "log(x) >= 0" is "0 - log(x)" = -log(x) diff --git a/python/tests/test_unconstrained.py b/python/tests/test_unconstrained.py index bcb0c54..c50e41c 100644 --- a/python/tests/test_unconstrained.py +++ b/python/tests/test_unconstrained.py @@ -3,7 +3,7 @@ import cvxpy as cp import numpy as np -from dnlp_diff_engine import C_problem +from cvxpy.reductions.solvers.nlp_solvers.diff_engine import C_problem def test_sum_log(): diff --git a/src/dnlp_diff_engine/__init__.py b/src/dnlp_diff_engine/__init__.py index 4b8efb1..0e6db19 100644 --- a/src/dnlp_diff_engine/__init__.py +++ b/src/dnlp_diff_engine/__init__.py @@ -1,373 +1,52 @@ """ -DNLP Diff Engine - Automatic differentiation for nonlinear optimization. +DNLP Diff Engine - Low-level C bindings for automatic differentiation. 
-This package provides automatic differentiation capabilities for CVXPY problems, -computing gradients, Jacobians, and Hessians needed by NLP solvers. +This package provides the raw C extension for building expression trees +and computing derivatives. For CVXPY integration, use: + from cvxpy.reductions.solvers.nlp_solvers.diff_engine import C_problem """ -import cvxpy as cp -import numpy as np -from cvxpy.reductions.inverse_data import InverseData -from scipy import sparse - -from . import _core as _diffengine - -__all__ = ["C_problem", "convert_problem", "build_variable_dict"] - - -def _chain_add(children): - """Chain multiple children with binary adds: a + b + c -> add(add(a, b), c).""" - result = children[0] - for child in children[1:]: - result = _diffengine.make_add(result, child) - return result - - -def _convert_matmul(expr, children): - """Convert matrix multiplication A @ f(x) or f(x) @ A.""" - # MulExpression has args: [left, right] - # One of them should be a Constant, the other a variable expression - left_arg, right_arg = expr.args - - if left_arg.is_constant(): - # A @ f(x) -> left_matmul - # TODO: why is this always dense? What's going on here? - # we later convert it to csr.... - A = np.asarray(left_arg.value, dtype=np.float64) - if A.ndim == 1: - A = A.reshape(1, -1) # Convert 1D to row vector - A_csr = sparse.csr_matrix(A) - m, n = A_csr.shape - return _diffengine.make_left_matmul( - children[1], # right child is the variable expression - A_csr.data.astype(np.float64), - A_csr.indices.astype(np.int32), - A_csr.indptr.astype(np.int32), - m, - n, - ) - elif right_arg.is_constant(): - # f(x) @ A -> right_matmul - - # TODO: why is this always dense? What's going on here? - # we later convert it to csr.... 
- A = np.asarray(right_arg.value, dtype=np.float64) - if A.ndim == 1: - A = A.reshape(-1, 1) # Convert 1D to column vector - A_csr = sparse.csr_matrix(A) - m, n = A_csr.shape - return _diffengine.make_right_matmul( - children[0], # left child is the variable expression - A_csr.data.astype(np.float64), - A_csr.indices.astype(np.int32), - A_csr.indptr.astype(np.int32), - m, - n, - ) - else: - raise NotImplementedError("MulExpression with two non-constant args not supported") - - -def _convert_multiply(expr, children): - """Convert multiplication based on argument types.""" - # multiply has args: [left, right] - left_arg, right_arg = expr.args - - # Check if left is a constant - if left_arg.is_constant(): - value = np.asarray(left_arg.value, dtype=np.float64) - - # Scalar constant - if value.size == 1: - scalar = float(value.flat[0]) - return _diffengine.make_const_scalar_mult(children[1], scalar) - - # Vector constant - if value.ndim == 1 or (value.ndim == 2 and min(value.shape) == 1): - vector = value.flatten() - return _diffengine.make_const_vector_mult(children[1], vector) - - # Check if right is a constant - elif right_arg.is_constant(): - value = np.asarray(right_arg.value, dtype=np.float64) - - # Scalar constant - if value.size == 1: - scalar = float(value.flat[0]) - return _diffengine.make_const_scalar_mult(children[0], scalar) - - # Vector constant - if value.ndim == 1 or (value.ndim == 2 and min(value.shape) == 1): - vector = value.flatten() - return _diffengine.make_const_vector_mult(children[0], vector) - - # Neither is constant, use general multiply - return _diffengine.make_multiply(children[0], children[1]) - - -def _extract_flat_indices_from_index(expr): - """Extract flattened indices from CVXPY index expression.""" - parent_shape = expr.args[0].shape - indices_per_dim = [np.arange(s.start, s.stop, s.step) for s in expr.key] - - if len(indices_per_dim) == 1: - return indices_per_dim[0].astype(np.int32) - elif len(indices_per_dim) == 2: - # Fortran 
order: idx = row + col * n_rows - return ( - np.add.outer(indices_per_dim[0], indices_per_dim[1] * parent_shape[0]) - .flatten(order="F") - .astype(np.int32) - ) - else: - raise NotImplementedError("index with >2 dimensions not supported") - - -def _extract_flat_indices_from_special_index(expr): - """Extract flattened indices from CVXPY special_index expression.""" - return np.reshape(expr._select_mat, expr._select_mat.size, order="F").astype(np.int32) - - -def _convert_rel_entr(_expr, children): - """Convert rel_entr(x, y) = x * log(x/y) elementwise.""" - return _diffengine.make_rel_entr(children[0], children[1]) - - -def _convert_quad_form(expr, children): - """Convert quadratic form x.T @ P @ x.""" - - P_arg = expr.args[1] - - if not isinstance(P_arg, cp.Constant): - raise NotImplementedError("quad_form requires P to be a constant matrix") - - P = np.asarray(P_arg.value, dtype=np.float64) - if P.ndim == 1: - P = P.reshape(-1, 1) - - P_csr = sparse.csr_matrix(P) - m, n = P_csr.shape - - return _diffengine.make_quad_form( - children[0], # x expression - P_csr.data.astype(np.float64), - P_csr.indices.astype(np.int32), - P_csr.indptr.astype(np.int32), - m, - n, - ) - - -def _convert_reshape(expr, children): - """Convert reshape - only Fortran order is supported. - - For Fortran order, reshape is a no-op since the underlying data layout - is unchanged. We create a reshape node that updates dimension metadata. - - Note: Only order='F' (Fortran/column-major) is supported. This is the - default and most common case in CVXPY. C order would require permutation. - """ - if expr.order != "F": - raise NotImplementedError( - f"reshape with order='{expr.order}' not supported. " - "Only order='F' (Fortran) is currently supported." 
- ) - - d1, d2 = expr.shape - return _diffengine.make_reshape(children[0], d1, d2) - - -# Mapping from CVXPY atom names to C diff engine functions -# Converters receive (expr, children) where expr is the CVXPY expression -ATOM_CONVERTERS = { - # Elementwise unary - "log": lambda _expr, children: _diffengine.make_log(children[0]), - "exp": lambda _expr, children: _diffengine.make_exp(children[0]), - # Affine unary - "NegExpression": lambda _expr, children: _diffengine.make_neg(children[0]), - "Promote": lambda expr, children: _diffengine.make_promote( - children[0], - expr.shape[0] if len(expr.shape) >= 1 else 1, - expr.shape[1] if len(expr.shape) >= 2 else 1, - ), - # N-ary (handles 2+ args) - "AddExpression": lambda _expr, children: _chain_add(children), - # Reductions - "Sum": lambda _expr, children: _diffengine.make_sum(children[0], -1), - # Bivariate - "multiply": _convert_multiply, - "QuadForm": _convert_quad_form, - "quad_over_lin": lambda _expr, children: _diffengine.make_quad_over_lin( - children[0], children[1] - ), - "rel_entr": _convert_rel_entr, - # Matrix multiplication - "MulExpression": _convert_matmul, - # Elementwise univariate with parameter - "power": lambda expr, children: _diffengine.make_power(children[0], float(expr.p.value)), - # Trigonometric - "sin": lambda _expr, children: _diffengine.make_sin(children[0]), - "cos": lambda _expr, children: _diffengine.make_cos(children[0]), - "tan": lambda _expr, children: _diffengine.make_tan(children[0]), - # Hyperbolic - "sinh": lambda _expr, children: _diffengine.make_sinh(children[0]), - "tanh": lambda _expr, children: _diffengine.make_tanh(children[0]), - "asinh": lambda _expr, children: _diffengine.make_asinh(children[0]), - "atanh": lambda _expr, children: _diffengine.make_atanh(children[0]), - # Other elementwise - "entr": lambda _expr, children: _diffengine.make_entr(children[0]), - "logistic": lambda _expr, children: _diffengine.make_logistic(children[0]), - "xexp": lambda _expr, children: 
_diffengine.make_xexp(children[0]), - # Indexing/slicing - "index": lambda expr, children: _diffengine.make_index( - children[0], _extract_flat_indices_from_index(expr) - ), - "special_index": lambda expr, children: _diffengine.make_index( - children[0], _extract_flat_indices_from_special_index(expr) - ), - "reshape": _convert_reshape, - # Reductions returning scalar - "Prod": lambda _expr, children: _diffengine.make_prod(children[0]), -} - - -def build_variable_dict(variables: list) -> tuple[dict, int]: - """ - Build dictionary mapping CVXPY variable ids to C variables. - - Args: - variables: list of CVXPY Variable objects - - Returns: - var_dict: {var.id: c_variable} mapping - n_vars: total number of scalar variables - """ - id_map, _, n_vars, var_shapes = InverseData.get_var_offsets(variables) - - var_dict = {} - for var in variables: - offset, _ = id_map[var.id] - shape = var_shapes[var.id] - if len(shape) == 2: - d1, d2 = shape[0], shape[1] - elif len(shape) == 1: - d1, d2 = shape[0], 1 - else: # scalar - d1, d2 = 1, 1 - c_var = _diffengine.make_variable(d1, d2, offset, n_vars) - var_dict[var.id] = c_var - - return var_dict, n_vars - - -def _convert_expr(expr, var_dict: dict, n_vars: int): - """Convert CVXPY expression using pre-built variable dictionary.""" - # Base case: variable lookup - if isinstance(expr, cp.Variable): - return var_dict[expr.id] - - # Base case: constant - if isinstance(expr, cp.Constant): - value = np.asarray(expr.value, dtype=np.float64).flatten() - d1 = expr.shape[0] if len(expr.shape) >= 1 else 1 - d2 = expr.shape[1] if len(expr.shape) >= 2 else 1 - return _diffengine.make_constant(d1, d2, n_vars, value) - - # Recursive case: atoms - atom_name = type(expr).__name__ - - if atom_name in ATOM_CONVERTERS: - children = [_convert_expr(arg, var_dict, n_vars) for arg in expr.args] - return ATOM_CONVERTERS[atom_name](expr, children) - - raise NotImplementedError(f"Atom '{atom_name}' not supported") - - -def convert_expressions(problem: 
cp.Problem) -> tuple: - """ - Convert CVXPY Problem to C expressions (low-level). - - Args: - problem: CVXPY Problem object - - Returns: - c_objective: C expression for objective - c_constraints: list of C expressions for constraints - """ - var_dict, n_vars = build_variable_dict(problem.variables()) - - # Convert objective - c_objective = _convert_expr(problem.objective.expr, var_dict, n_vars) - - # Convert constraints (expression part only for now) - c_constraints = [] - for constr in problem.constraints: - c_expr = _convert_expr(constr.expr, var_dict, n_vars) - c_constraints.append(c_expr) - - return c_objective, c_constraints - - -def convert_problem(problem: cp.Problem) -> "C_problem": - """ - Convert CVXPY Problem to C problem struct. - - Args: - problem: CVXPY Problem object - - Returns: - C_Problem wrapper around C problem struct - """ - return C_problem(problem) - - -class C_problem: - """Wrapper around C problem struct for CVXPY problems.""" - - def __init__(self, cvxpy_problem: cp.Problem): - var_dict, n_vars = build_variable_dict(cvxpy_problem.variables()) - c_obj = _convert_expr(cvxpy_problem.objective.expr, var_dict, n_vars) - c_constraints = [_convert_expr(c.expr, var_dict, n_vars) for c in cvxpy_problem.constraints] - self._capsule = _diffengine.make_problem(c_obj, c_constraints) - self._allocated = False - - def init_derivatives(self): - """Initialize derivative structures. Must be called before forward/gradient/jacobian.""" - _diffengine.problem_init_derivatives(self._capsule) - self._allocated = True - - def objective_forward(self, u: np.ndarray) -> float: - """Evaluate objective. Returns obj_value float.""" - return _diffengine.problem_objective_forward(self._capsule, u) - - def constraint_forward(self, u: np.ndarray) -> np.ndarray: - """Evaluate constraints only. Returns constraint_values array.""" - return _diffengine.problem_constraint_forward(self._capsule, u) - - def gradient(self) -> np.ndarray: - """Compute gradient of objective. 
Call objective_forward first. Returns gradient array.""" - return _diffengine.problem_gradient(self._capsule) - - def jacobian(self) -> sparse.csr_matrix: - """Compute constraint Jacobian. Call constraint_forward first.""" - data, indices, indptr, shape = _diffengine.problem_jacobian(self._capsule) - return sparse.csr_matrix((data, indices, indptr), shape=shape) - - def hessian(self, obj_factor: float, lagrange: np.ndarray) -> sparse.csr_matrix: - """Compute Lagrangian Hessian. - - Computes: obj_factor * H_obj + sum(lagrange_i * H_constraint_i) - - Call objective_forward and constraint_forward before this. - - Args: - obj_factor: Weight for objective Hessian - lagrange: Array of Lagrange multipliers (length = total_constraint_size) - - Returns: - scipy CSR matrix of shape (n_vars, n_vars) - """ - data, indices, indptr, shape = _diffengine.problem_hessian( - self._capsule, obj_factor, lagrange - ) - return sparse.csr_matrix((data, indices, indptr), shape=shape) +# Re-export all C functions directly from the _core extension +from ._core import * # noqa: F401, F403 + +__all__ = [ + # Atom constructors + "make_variable", + "make_constant", + "make_add", + "make_neg", + "make_sum", + "make_promote", + "make_index", + "make_reshape", + "make_log", + "make_exp", + "make_power", + "make_entr", + "make_logistic", + "make_xexp", + "make_sin", + "make_cos", + "make_tan", + "make_sinh", + "make_tanh", + "make_asinh", + "make_atanh", + "make_multiply", + "make_const_scalar_mult", + "make_const_vector_mult", + "make_left_matmul", + "make_right_matmul", + "make_quad_form", + "make_quad_over_lin", + "make_rel_entr", + "make_prod", + # Problem interface + "make_problem", + "problem_init_derivatives", + "problem_objective_forward", + "problem_constraint_forward", + "problem_gradient", + "problem_jacobian", + "problem_hessian", +]