From 7490e5d8ff9e461107e25c4d81ebedb1cf2997bb Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 05:08:13 -0700
Subject: [PATCH 01/11] Add backend operation proxies

---
 klongpy/backend.py                | 166 ++++++++++++-----------------
 klongpy/backends/__init__.py      |  10 ++
 klongpy/backends/numpy_backend.py | 167 ++++++++++++++++++++++++++++++
 klongpy/backends/torch_backend.py |  95 +++++++++++++++++
 tests/test_backend.py             |  47 +++++++++
 5 files changed, 385 insertions(+), 100 deletions(-)
 create mode 100644 klongpy/backends/__init__.py
 create mode 100644 klongpy/backends/numpy_backend.py
 create mode 100644 klongpy/backends/torch_backend.py
 create mode 100644 tests/test_backend.py

diff --git a/klongpy/backend.py b/klongpy/backend.py
index c624928..d643480 100644
--- a/klongpy/backend.py
+++ b/klongpy/backend.py
@@ -1,103 +1,69 @@
-import os
-import warnings
-
-# Attempt to import CuPy. If not available, set use_gpu to False.
-use_gpu = bool(os.environ.get('USE_GPU') == '1')
-if use_gpu:
-    try:
-        import cupy as np
-        use_gpu = True
-    except ImportError:
-        import numpy as np
-        use_gpu = False
-else:
-    import numpy as np
-
-
-def is_supported_type(x):
-    """
-    CuPy does not support strings or jagged arrays.
-    Note: add any other unsupported types here.
-    """
-    if isinstance(x, str) or is_jagged_array(x):
-        return False
-    return True
+"""Backend selection utilities for klongpy."""
+
+from importlib import import_module
+from typing import Any
+import numpy as _np
 
-def is_jagged_array(x):
-    """
-    Check if an array is jagged.
+# default numpy compatibility shim for legacy modules
+_np.seterr(divide="ignore")
+_np.isarray = lambda x: isinstance(x, _np.ndarray)
+np = _np
+
+BACKEND = "numpy"
+
+
+def current():
+    """Return the currently selected backend module."""
+    return import_module(f"klongpy.backends.{BACKEND}_backend")
+
+
+def set_backend(name: str) -> None:
+    """Select the computation backend.
+
+    Parameters
+    ----------
+    name: str
+        Either ``"numpy"`` or ``"torch"``.
     """
-    if isinstance(x, list):
-        # If the lengths of sublists vary, it's a jagged array.
-        return len(set(map(len, x))) > 1
-    return False
-
-if use_gpu:
-    import cupy
-    import numpy
-
-    class CuPyReductionKernelWrapper:
-        def __init__(self, fn, reduce_fn_1, reduce_fn_2):
-            self.fn = fn
-            self.reduce_fn_1 = reduce_fn_1
-            self.reduce_fn_2 = reduce_fn_2
-
-        def __call__(self, *args, **kwargs):
-            return self.fn(*args, **kwargs)
-
-        def reduce(self, x):
-            return self.reduce_fn_1(x) if x.ndim == 1 else self.reduce_fn_2(x[0], x[1])
-
-    add_reduce_2 = cupy.ElementwiseKernel(
-        'T x, T y',
-        'T z',
-        'z = (x + y)',
-        'add_reduce_2')
-    np.add = CuPyReductionKernelWrapper(cupy.add, cupy.sum, add_reduce_2)
-
-    def subtract_reduce_1(x):
-        return 2*x[0] - cupy.sum(x)
-
-    subtract_reduce_2 = cupy.ElementwiseKernel(
-        'T x, T y',
-        'T z',
-        'z = (x - y)',
-        'subtract_reduce_2')
-    np.subtract = CuPyReductionKernelWrapper(cupy.subtract, subtract_reduce_1, subtract_reduce_2)
-
-    multiply_reduce_1 = cupy.ReductionKernel(
-        'T x',
-        'T y',
-        'x',
-        'a * b',
-        'y = a',
-        '1',
-        'multiply_reduce_1'
-    )
-    multiply_reduce_2 = cupy.ElementwiseKernel(
-        'T x, T y',
-        'T z',
-        'z = (x * y)',
-        'multiply_reduce_2')
-    np.multiply = CuPyReductionKernelWrapper(cupy.multiply, multiply_reduce_1, multiply_reduce_2)
-
-    def divide_reduce_1(x):
-        raise NotImplementedError()
-
-    divide_reduce_2 = cupy.ElementwiseKernel(
-        'T x, T y',
-        'T z',
-        'z = (x / y)',
-        'divide_reduce_2')
-    np.divide = CuPyReductionKernelWrapper(cupy.divide, divide_reduce_1, divide_reduce_2)
-
-    np.isarray = lambda x: isinstance(x, (numpy.ndarray, cupy.ndarray))
-
-# np.hstack = lambda x: cupy.hstack(x) if use_gpu and is_supported_type(x) else numpy.hstack(x)
-else:
-    np.seterr(divide='ignore')
-    warnings.filterwarnings("error", category=np.VisibleDeprecationWarning)
-    np.isarray = lambda x: isinstance(x, np.ndarray)
-
-np
+    global BACKEND
+    name = name.lower()
+    if name not in {"numpy", "torch"}:
+        raise ValueError(f"unknown backend '{name}'")
+    if name == "torch":
+        import_module("klongpy.backends.torch_backend")
+    BACKEND = name
+
+
+def array(obj: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> Any:
+    """Create an array or tensor using the active backend."""
+    return current().array(obj, dtype=dtype, requires_grad=requires_grad)
+
+
+def add(a: Any, b: Any) -> Any:
+    """Element-wise addition via the active backend."""
+    return current().add(a, b)
+
+
+def mul(a: Any, b: Any) -> Any:
+    """Element-wise multiplication via the active backend."""
+    return current().mul(a, b)
+
+
+def matmul(a: Any, b: Any) -> Any:
+    """Matrix multiplication via the active backend."""
+    return current().matmul(a, b)
+
+
+def sum(a: Any, axis: int | None = None) -> Any:
+    """Sum elements of ``a`` via the active backend."""
+    return current().sum(a, axis=axis)
+
+
+def grad(fn: Any, wrt: int = 0) -> Any:
+    """Return gradient function via the active backend."""
+    return current().grad(fn, wrt=wrt)
+
+
+def stop(x: Any) -> Any:
+    """Detach ``x`` from the autograd graph via the active backend."""
+    return current().stop(x)
diff --git a/klongpy/backends/__init__.py b/klongpy/backends/__init__.py
new file mode 100644
index 0000000..af9077e
--- /dev/null
+++ b/klongpy/backends/__init__.py
@@ -0,0 +1,10 @@
+"""Backend implementations."""
+
+from . import numpy_backend
+
+try:
+    from . import torch_backend  # noqa: F401
+except Exception:  # torch may not be available
+    torch_backend = None
+
+__all__ = ["numpy_backend", "torch_backend"]
diff --git a/klongpy/backends/numpy_backend.py b/klongpy/backends/numpy_backend.py
new file mode 100644
index 0000000..d36e933
--- /dev/null
+++ b/klongpy/backends/numpy_backend.py
@@ -0,0 +1,167 @@
+"""NumPy backend with minimal reverse-mode autodiff."""
+
+from __future__ import annotations
+
+import numpy as np
+from typing import Any, Callable, Iterable, Optional
+
+
+class Tensor:
+    """Simple tensor wrapper for reverse-mode autodiff."""
+
+    def __init__(self, data: np.ndarray, _children: Iterable[Tensor] = (), requires_grad: bool = False):
+        self.data = np.asarray(data)
+        if self.data.dtype == object:
+            raise TypeError("object dtype not supported")
+        self.requires_grad = requires_grad
+        self.grad: Optional[np.ndarray] = np.zeros_like(self.data, dtype=self.data.dtype) if requires_grad else None
+        self._prev = set(_children)
+        self._backward: Callable[[], None] = lambda: None
+
+    def __repr__(self) -> str:  # pragma: no cover - debug helper
+        return f"Tensor(data={self.data}, grad={self.grad})"
+
+
+def array(obj: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> Any:
+    """Create an array or tensor."""
+    if isinstance(obj, Tensor):
+        data = obj.data if dtype is None else obj.data.astype(dtype)
+        return Tensor(data, requires_grad=requires_grad or obj.requires_grad)
+    arr = np.asarray(obj, dtype=dtype)
+    if arr.dtype == object:
+        raise TypeError("object dtype not supported")
+    return Tensor(arr, requires_grad=requires_grad) if requires_grad else arr
+
+
+def _ensure_tensor(x: Any) -> Tensor:
+    if isinstance(x, Tensor):
+        return x
+    arr = np.asarray(x)
+    if arr.dtype == object:
+        raise TypeError("object dtype not supported")
+    return Tensor(arr)
+
+
+def _broadcast_grad(grad: np.ndarray, shape: tuple[int, ...]) -> np.ndarray:
+    g = grad
+    while len(g.shape) > len(shape):
+        g = g.sum(axis=0)
+    for i, dim in enumerate(shape):
+        if dim == 1:
+            g = g.sum(axis=i, keepdims=True)
+    return g
+
+
+def add(a: Any, b: Any) -> Any:
+    """Element-wise addition."""
+    if not isinstance(a, Tensor) and not isinstance(b, Tensor):
+        return np.add(a, b)
+    ta, tb = _ensure_tensor(a), _ensure_tensor(b)
+    out = Tensor(ta.data + tb.data, (ta, tb), requires_grad=ta.requires_grad or tb.requires_grad)
+
+    def _backward() -> None:
+        if out.grad is None:
+            return
+        if ta.requires_grad:
+            ta.grad += _broadcast_grad(out.grad, ta.data.shape)
+        if tb.requires_grad:
+            tb.grad += _broadcast_grad(out.grad, tb.data.shape)
+
+    out._backward = _backward
+    return out
+
+
+def mul(a: Any, b: Any) -> Any:
+    """Element-wise multiplication."""
+    if not isinstance(a, Tensor) and not isinstance(b, Tensor):
+        return np.multiply(a, b)
+    ta, tb = _ensure_tensor(a), _ensure_tensor(b)
+    out = Tensor(ta.data * tb.data, (ta, tb), requires_grad=ta.requires_grad or tb.requires_grad)
+
+    def _backward() -> None:
+        if out.grad is None:
+            return
+        if ta.requires_grad:
+            ta.grad += _broadcast_grad(out.grad * tb.data, ta.data.shape)
+        if tb.requires_grad:
+            tb.grad += _broadcast_grad(out.grad * ta.data, tb.data.shape)
+
+    out._backward = _backward
+    return out
+
+
+def matmul(a: Any, b: Any) -> Any:
+    """Matrix multiplication."""
+    if not isinstance(a, Tensor) and not isinstance(b, Tensor):
+        return np.matmul(a, b)
+    ta, tb = _ensure_tensor(a), _ensure_tensor(b)
+    out = Tensor(ta.data @ tb.data, (ta, tb), requires_grad=ta.requires_grad or tb.requires_grad)
+
+    def _backward() -> None:
+        if out.grad is None:
+            return
+        if ta.requires_grad:
+            ta.grad += out.grad @ tb.data.T
+        if tb.requires_grad:
+            tb.grad += ta.data.T @ out.grad
+
+    out._backward = _backward
+    return out
+
+
+def sum(a: Any, axis: int | None = None) -> Any:
+    """Sum of elements."""
+    if not isinstance(a, Tensor):
+        return np.sum(a, axis=axis)
+    out = Tensor(np.sum(a.data, axis=axis), (a,), requires_grad=a.requires_grad)
+
+    def _backward() -> None:
+        if out.grad is None or not a.requires_grad:
+            return
+        grad = out.grad
+        if axis is None:
+            grad = np.broadcast_to(grad, a.data.shape)
+        else:
+            grad = np.expand_dims(grad, axis)
+            grad = np.broadcast_to(grad, a.data.shape)
+        a.grad += grad
+
+    out._backward = _backward
+    return out
+
+
+def stop(x: Any) -> Any:
+    """Detach ``x`` from the autograd graph."""
+    if isinstance(x, Tensor):
+        return Tensor(x.data.copy(), requires_grad=False)
+    return x
+
+
+def grad(fn: Callable[..., Any], wrt: int = 0) -> Callable[..., Any]:
+    """Return a function computing ``∂fn/∂arg[wrt]``."""
+
+    def _grad_fn(*args: Any) -> Any:
+        targs = []
+        for i, a in enumerate(args):
+            t = array(a, requires_grad=(i == wrt))
+            targs.append(t)
+        out = fn(*targs)
+        if not isinstance(out, Tensor):
+            raise RuntimeError("not differentiable")
+        out.grad = np.ones_like(out.data)
+        topo: list[Tensor] = []
+        visited: set[Tensor] = set()
+
+        def build(v: Tensor) -> None:
+            if v not in visited:
+                visited.add(v)
+                for child in v._prev:
+                    build(child)
+                topo.append(v)
+
+        build(out)
+        for node in reversed(topo):
+            node._backward()
+        return targs[wrt].grad
+
+    return _grad_fn
diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py
new file mode 100644
index 0000000..82c8e4c
--- /dev/null
+++ b/klongpy/backends/torch_backend.py
@@ -0,0 +1,95 @@
+"""Torch backend using NumPy fallback for strings."""
+
+from __future__ import annotations
+
+from typing import Any, Callable
+from importlib import import_module
+
+try:
+    import torch
+except Exception as e:  # pragma: no cover - optional dependency
+    raise ImportError("torch backend requires the 'torch' package") from e
+
+import numpy as np
+from . import numpy_backend as npb
+
+
+def _contains_strings(x: Any) -> bool:
+    if isinstance(x, str):
+        return True
+    if isinstance(x, (list, tuple)):
+        return any(_contains_strings(i) for i in x)
+    if isinstance(x, np.ndarray):
+        return x.dtype.kind in {"U", "S", "O"}
+    return False
+
+
+def _to_numpy(x: Any) -> Any:
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu().numpy()
+    return x
+
+
+def array(obj: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> Any:
+    """Create a torch tensor or numpy array."""
+    if _contains_strings(obj):
+        return npb.array(obj, dtype=dtype, requires_grad=requires_grad)
+    return torch.tensor(obj, dtype=dtype, requires_grad=requires_grad)
+
+
+def _torchify(x: Any) -> torch.Tensor:
+    if isinstance(x, torch.Tensor):
+        return x
+    return torch.tensor(x)
+
+
+def add(a: Any, b: Any) -> Any:
+    if _contains_strings(a) or _contains_strings(b):
+        return npb.add(_to_numpy(a), _to_numpy(b))
+    return _torchify(a) + _torchify(b)
+
+
+def mul(a: Any, b: Any) -> Any:
+    if _contains_strings(a) or _contains_strings(b):
+        return npb.mul(_to_numpy(a), _to_numpy(b))
+    return _torchify(a) * _torchify(b)
+
+
+def matmul(a: Any, b: Any) -> Any:
+    if _contains_strings(a) or _contains_strings(b):
+        return npb.matmul(_to_numpy(a), _to_numpy(b))
+    return _torchify(a) @ _torchify(b)
+
+
+def sum(a: Any, axis: int | None = None) -> Any:
+    if _contains_strings(a):
+        return npb.sum(_to_numpy(a), axis=axis)
+    return torch.sum(_torchify(a), dim=axis)
+
+
+def stop(x: Any) -> Any:
+    if isinstance(x, torch.Tensor):
+        return x.detach()
+    return npb.stop(x)
+
+
+def grad(fn: Callable[..., Any], wrt: int = 0) -> Callable[..., Any]:
+    """Return a function computing ``∂fn/∂arg[wrt]`` using torch.autograd."""
+
+    def _grad_fn(*args: Any) -> Any:
+        if any(_contains_strings(a) for a in args):
+            raise RuntimeError("not differentiable")
+        targs = []
+        for i, a in enumerate(args):
+            if isinstance(a, torch.Tensor):
+                t = a.clone().detach().requires_grad_(i == wrt)
+            else:
+                t = torch.tensor(a, dtype=torch.float64, requires_grad=(i == wrt))
+            targs.append(t)
+        out = fn(*targs)
+        if not isinstance(out, torch.Tensor):
+            raise RuntimeError("not differentiable")
+        g, = torch.autograd.grad(out, targs[wrt])
+        return g
+
+    return _grad_fn
diff --git a/tests/test_backend.py b/tests/test_backend.py
new file mode 100644
index 0000000..8630ab3
--- /dev/null
+++ b/tests/test_backend.py
@@ -0,0 +1,47 @@
+import unittest
+import numpy as np
+
+from klongpy import backend
+
+
+class TestBackend(unittest.TestCase):
+    def _check_grad(self, name: str):
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.sum(b.mul(b.add(x, 1), b.add(x, 1)))
+
+        g = b.grad(f)
+        x = b.array([1.0, 2.0, 3.0], requires_grad=True)
+        grad = g(x)
+        if hasattr(grad, "detach"):
+            grad = grad.detach().cpu().numpy()
+        np.testing.assert_allclose(np.array(grad), np.array([4.0, 6.0, 8.0]))
+
+    def test_grad_numpy(self):
+        self._check_grad("numpy")
+
+    def test_grad_torch(self):
+        self._check_grad("torch")
+
+    def test_strings_not_differentiable(self):
+        try:
+            backend.set_backend("torch")
+        except ImportError:
+            raise unittest.SkipTest("torch backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.add(x, ["a", "b", "c"])
+
+        x = b.array([1.0, 2.0, 3.0], requires_grad=True)
+        with self.assertRaisesRegex(RuntimeError, "not differentiable"):
+            b.grad(f)(x)
+
+
+if __name__ == "__main__":
+    unittest.main()
From c5c946ec32fee1076b8db1f01c371bbac47d803d Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 12:48:30 -0700
Subject: [PATCH 02/11] Remove unused import

---
 klongpy/backends/torch_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py
index 82c8e4c..3c7d5b7 100644
--- a/klongpy/backends/torch_backend.py
+++ b/klongpy/backends/torch_backend.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from typing import Any, Callable
-from importlib import import_module
 
 try:
     import torch

From 7a3d29e316cc55b0d80f07c9bc48cccf2bba6560 Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 13:02:07 -0700
Subject: [PATCH 03/11] Install torch-cpu in CI and allow backend env

---
 .github/workflows/unittests.yml | 6 ++++--
 klongpy/backend.py              | 3 ++-
 setup.py                        | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index c06a3e6..c60d64a 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -23,7 +23,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           python -m pip install pylint flake8
-          pip install -e ".[full]"
+          pip install -e ".[full,torch-cpu]" --extra-index-url https://download.pytorch.org/whl/cpu
       # - name: Lint with pylint
       #   run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
       - name: Lint with flake8
@@ -45,6 +45,8 @@
         with:
           python-version: 3.11
       - name: Install Dependencies
-        run: pip install -e ".[full]"
+        run: |
+          pip install --upgrade pip
+          pip install -e ".[full,torch-cpu]" --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Run unittest
         run: python3 -m unittest -v
diff --git a/klongpy/backend.py b/klongpy/backend.py
index d643480..1ac471d 100644
--- a/klongpy/backend.py
+++ b/klongpy/backend.py
@@ -1,5 +1,6 @@
 """Backend selection utilities for klongpy."""
 
+import os
 from importlib import import_module
 from typing import Any
 import numpy as _np
@@ -9,7 +10,7 @@
 _np.isarray = lambda x: isinstance(x, _np.ndarray)
 np = _np
 
-BACKEND = "numpy"
+BACKEND = os.environ.get("KLONGPY_BACKEND", "numpy").lower()
diff --git a/setup.py b/setup.py
index b14fa8a..042cb3f 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@
     'web': ["aiohttp==3.9.4"],
     'db': ["pandas==2.2.2","duckdb==1.3.0"],
     'ws': ["websockets==12.0"],
+    'torch-cpu': ["torch"],
 }
 
 # full feature set extras

From 5ce3c4ef17b40512f92071ef6da16a6b0883c423 Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 13:19:22 -0700
Subject: [PATCH 04/11] Add torch and torch-cpu extras

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 042cb3f..6988bd1 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@
     'web': ["aiohttp==3.9.4"],
     'db': ["pandas==2.2.2","duckdb==1.3.0"],
     'ws': ["websockets==12.0"],
+    'torch': ["torch"],
     'torch-cpu': ["torch"],
 }

From a7037717b6a79d7fbc21ebc47a078557d201fb6c Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 13:19:27 -0700
Subject: [PATCH 05/11] Remove redundant torch-cpu extra

---
 .github/workflows/unittests.yml | 4 ++--
 setup.py                        | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index c60d64a..eb0d94e 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -23,7 +23,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
          python -m pip install pylint flake8
".[full,torch-cpu]" --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e ".[full,torch]" --extra-index-url https://download.pytorch.org/whl/cpu # - name: Lint with pylint # run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py - name: Lint with flake8 @@ -47,6 +47,6 @@ jobs: - name: Install Dependencies run: | pip install --upgrade pip - pip install -e ".[full,torch-cpu]" --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e ".[full,torch]" --extra-index-url https://download.pytorch.org/whl/cpu - name: Run unittest run: python3 -m unittest -v diff --git a/setup.py b/setup.py index 6988bd1..88b3728 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ 'db': ["pandas==2.2.2","duckdb==1.3.0"], 'ws': ["websockets==12.0"], 'torch': ["torch"], - 'torch-cpu': ["torch"], } # full feature set extras From 673354a8ce47190cef089932c5f01a68ee4a2a9e Mon Sep 17 00:00:00 2001 From: Brian Guarraci Date: Wed, 9 Jul 2025 13:42:54 -0700 Subject: [PATCH 06/11] fix grad error with strings --- klongpy/backends/torch_backend.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py index 3c7d5b7..a1a6634 100644 --- a/klongpy/backends/torch_backend.py +++ b/klongpy/backends/torch_backend.py @@ -85,7 +85,10 @@ def _grad_fn(*args: Any) -> Any: else: t = torch.tensor(a, dtype=torch.float64, requires_grad=(i == wrt)) targs.append(t) - out = fn(*targs) + try: + out = fn(*targs) + except Exception as e: # fallbacks may fail with type errors + raise RuntimeError("not differentiable") from e if not isinstance(out, torch.Tensor): raise RuntimeError("not differentiable") g, = torch.autograd.grad(out, targs[wrt]) From 31fdede24085efa2bdd18e182a2abe4a7552d29d Mon Sep 17 00:00:00 2001 From: Brian Guarraci Date: Wed, 9 Jul 2025 16:09:47 -0700 Subject: [PATCH 07/11] add torch strict mode and profiler --- klongpy/backends/torch_backend.py | 23 +++++++++++++++++------ scripts/profile_torch_backend.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 6 deletions(-) create mode 100755 scripts/profile_torch_backend.py diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py index a1a6634..dbd9077 100644 --- a/klongpy/backends/torch_backend.py +++ b/klongpy/backends/torch_backend.py @@ -3,6 +3,7 @@ from __future__ import annotations from typing import Any, Callable +import os try: import torch @@ -12,6 +13,9 @@ import numpy as np from . 
 from . import numpy_backend as npb
 
+# when STRICT is true, string checks are skipped for performance
+STRICT = os.environ.get("KLONGPY_TORCH_STRICT", "0") == "1"
+
 
 def _contains_strings(x: Any) -> bool:
     if isinstance(x, str):
@@ -23,6 +27,13 @@ def _contains_strings(x: Any) -> bool:
     return False
 
 
+def _has_strings(*objs: Any) -> bool:
+    """Return True if any object contains strings unless STRICT mode is on."""
+    if STRICT:
+        return False
+    return any(_contains_strings(o) for o in objs)
+
+
 def _to_numpy(x: Any) -> Any:
     if isinstance(x, torch.Tensor):
         return x.detach().cpu().numpy()
@@ -31,7 +42,7 @@ def _to_numpy(x: Any) -> Any:
 
 def array(obj: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> Any:
     """Create a torch tensor or numpy array."""
-    if _contains_strings(obj):
+    if _has_strings(obj):
         return npb.array(obj, dtype=dtype, requires_grad=requires_grad)
     return torch.tensor(obj, dtype=dtype, requires_grad=requires_grad)
@@ -43,25 +54,25 @@ def _torchify(x: Any) -> torch.Tensor:
 
 
 def add(a: Any, b: Any) -> Any:
-    if _contains_strings(a) or _contains_strings(b):
+    if _has_strings(a, b):
         return npb.add(_to_numpy(a), _to_numpy(b))
     return _torchify(a) + _torchify(b)
 
 
 def mul(a: Any, b: Any) -> Any:
-    if _contains_strings(a) or _contains_strings(b):
+    if _has_strings(a, b):
         return npb.mul(_to_numpy(a), _to_numpy(b))
     return _torchify(a) * _torchify(b)
 
 
 def matmul(a: Any, b: Any) -> Any:
-    if _contains_strings(a) or _contains_strings(b):
+    if _has_strings(a, b):
         return npb.matmul(_to_numpy(a), _to_numpy(b))
     return _torchify(a) @ _torchify(b)
 
 
 def sum(a: Any, axis: int | None = None) -> Any:
-    if _contains_strings(a):
+    if _has_strings(a):
         return npb.sum(_to_numpy(a), axis=axis)
     return torch.sum(_torchify(a), dim=axis)
@@ -76,7 +87,7 @@ def grad(fn: Callable[..., Any], wrt: int = 0) -> Callable[..., Any]:
     """Return a function computing ``∂fn/∂arg[wrt]`` using torch.autograd."""
 
     def _grad_fn(*args: Any) -> Any:
-        if any(_contains_strings(a) for a in args):
+        if _has_strings(*args):
             raise RuntimeError("not differentiable")
         targs = []
         for i, a in enumerate(args):
diff --git a/scripts/profile_torch_backend.py b/scripts/profile_torch_backend.py
new file mode 100755
index 0000000..de141a9
--- /dev/null
+++ b/scripts/profile_torch_backend.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python3
+"""Simple performance profiler for the torch backend string shims."""
+
+import os
+import timeit
+from klongpy import backend
+
+
+def benchmark(strict: bool) -> float:
+    os.environ["KLONGPY_TORCH_STRICT"] = "1" if strict else "0"
+    backend.set_backend("torch")
+    b = backend.current()
+    x = b.array(list(range(1000)), dtype=float)
+
+    def _op():
+        b.mul(b.add(x, 1), b.add(x, 2))
+
+    return timeit.timeit(_op, number=1000)
+
+
+def main() -> None:
+    no_check = benchmark(strict=True)
+    with_check = benchmark(strict=False)
+    print(f"Strict (no string check): {no_check:.4f}s")
+    print(f"Default (with string check): {with_check:.4f}s")
+
+
+if __name__ == "__main__":
+    main()

From a0424536543181c2841d64b36be60977ace7f23e Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 16:10:15 -0700
Subject: [PATCH 08/11] Use try/except fallback for strings in torch backend

---
 klongpy/backends/torch_backend.py | 74 ++++++++++++-----------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py
index dbd9077..995cbbb 100644
--- a/klongpy/backends/torch_backend.py
+++ b/klongpy/backends/torch_backend.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from typing import Any, Callable
-import os
 
 try:
     import torch
@@ -13,26 +12,6 @@
 import numpy as np
 from . import numpy_backend as npb
 
-# when STRICT is true, string checks are skipped for performance
-STRICT = os.environ.get("KLONGPY_TORCH_STRICT", "0") == "1"
-
-
-def _contains_strings(x: Any) -> bool:
-    if isinstance(x, str):
-        return True
-    if isinstance(x, (list, tuple)):
-        return any(_contains_strings(i) for i in x)
-    if isinstance(x, np.ndarray):
-        return x.dtype.kind in {"U", "S", "O"}
-    return False
-
-
-def _has_strings(*objs: Any) -> bool:
-    """Return True if any object contains strings unless STRICT mode is on."""
-    if STRICT:
-        return False
-    return any(_contains_strings(o) for o in objs)
-
-
 def _to_numpy(x: Any) -> Any:
     if isinstance(x, torch.Tensor):
         return x.detach().cpu().numpy()
@@ -42,39 +20,54 @@ def _to_numpy(x: Any) -> Any:
 
 def array(obj: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> Any:
     """Create a torch tensor or numpy array."""
-    if _has_strings(obj):
+    t = _to_tensor(obj, dtype=dtype, requires_grad=requires_grad)
+    if t is None:
         return npb.array(obj, dtype=dtype, requires_grad=requires_grad)
-    return torch.tensor(obj, dtype=dtype, requires_grad=requires_grad)
+    return t
 
 
-def _torchify(x: Any) -> torch.Tensor:
+def _to_tensor(x: Any, *, dtype: Any | None = None, requires_grad: bool = False) -> torch.Tensor | None:
+    """Return a ``torch.Tensor`` if possible else ``None``."""
     if isinstance(x, torch.Tensor):
-        return x
-    return torch.tensor(x)
+        t = x if dtype is None else x.to(dtype)
+        if requires_grad:
+            t = t.clone().detach().requires_grad_(True)
+        return t
+    try:
+        return torch.tensor(x, dtype=dtype, requires_grad=requires_grad)
+    except Exception:
+        return None
 
 
 def add(a: Any, b: Any) -> Any:
-    if _has_strings(a, b):
+    ta = _to_tensor(a)
+    tb = _to_tensor(b)
+    if ta is None or tb is None:
         return npb.add(_to_numpy(a), _to_numpy(b))
-    return _torchify(a) + _torchify(b)
+    return ta + tb
 
 
 def mul(a: Any, b: Any) -> Any:
-    if _has_strings(a, b):
+    ta = _to_tensor(a)
+    tb = _to_tensor(b)
+    if ta is None or tb is None:
         return npb.mul(_to_numpy(a), _to_numpy(b))
-    return _torchify(a) * _torchify(b)
+    return ta * tb
 
 
 def matmul(a: Any, b: Any) -> Any:
-    if _has_strings(a, b):
+    ta = _to_tensor(a)
+    tb = _to_tensor(b)
+    if ta is None or tb is None:
         return npb.matmul(_to_numpy(a), _to_numpy(b))
-    return _torchify(a) @ _torchify(b)
+    return ta @ tb
 
 
 def sum(a: Any, axis: int | None = None) -> Any:
-    if _has_strings(a):
+    ta = _to_tensor(a)
+    if ta is None:
         return npb.sum(_to_numpy(a), axis=axis)
-    return torch.sum(_torchify(a), dim=axis)
+    return torch.sum(ta, dim=axis)
 
 
 def stop(x: Any) -> Any:
@@ -87,14 +80,13 @@ def grad(fn: Callable[..., Any], wrt: int = 0) -> Callable[..., Any]:
     """Return a function computing ``∂fn/∂arg[wrt]`` using torch.autograd."""
 
     def _grad_fn(*args: Any) -> Any:
-        if _has_strings(*args):
-            raise RuntimeError("not differentiable")
         targs = []
         for i, a in enumerate(args):
-            if isinstance(a, torch.Tensor):
-                t = a.clone().detach().requires_grad_(i == wrt)
-            else:
-                t = torch.tensor(a, dtype=torch.float64, requires_grad=(i == wrt))
+            t = _to_tensor(a, dtype=torch.float64, requires_grad=(i == wrt))
+            if t is None:
+                raise RuntimeError("not differentiable")
+            if not isinstance(a, torch.Tensor):
+                t = t  # type: ignore[assignment]
             targs.append(t)
         try:
             out = fn(*targs)

From e869f28a614e328a284faed91d0654cc6a9c9846 Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 17:32:42 -0700
Subject: [PATCH 09/11] Remove unused numpy import

---
 klongpy/backends/torch_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py
index 995cbbb..78ddeb7 100644
--- a/klongpy/backends/torch_backend.py
+++ b/klongpy/backends/torch_backend.py
@@ -9,7 +9,6 @@
 except Exception as e:  # pragma: no cover - optional dependency
     raise ImportError("torch backend requires the 'torch' package") from e
 
-import numpy as np
 from . import numpy_backend as npb
 
 def _to_numpy(x: Any) -> Any:

From 7aa34fd49b8c0e59a186b02095daadb97b46f2e5 Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 18:43:19 -0700
Subject: [PATCH 10/11] Fix autograd grad for matrices

---
 klongpy/backends/torch_backend.py |  6 +++++-
 tests/test_autograd.py            | 33 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_autograd.py

diff --git a/klongpy/backends/torch_backend.py b/klongpy/backends/torch_backend.py
index 78ddeb7..818d325 100644
--- a/klongpy/backends/torch_backend.py
+++ b/klongpy/backends/torch_backend.py
@@ -93,7 +93,11 @@ def _grad_fn(*args: Any) -> Any:
             raise RuntimeError("not differentiable") from e
         if not isinstance(out, torch.Tensor):
             raise RuntimeError("not differentiable")
-        g, = torch.autograd.grad(out, targs[wrt])
+        if out.ndim == 0:
+            grad_out = None
+        else:
+            grad_out = torch.ones_like(out)
+        g, = torch.autograd.grad(out, targs[wrt], grad_outputs=grad_out)
         return g
 
     return _grad_fn
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
new file mode 100644
index 0000000..d95201c
--- /dev/null
+++ b/tests/test_autograd.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+
+from klongpy import backend
+
+
+class TestAutograd(unittest.TestCase):
+    def _check_matrix_grad(self, name: str):
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.sum(b.matmul(x, x))
+
+        g = b.grad(f)
+        x = b.array([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
+        grad = g(x)
+        if hasattr(grad, "detach"):
+            grad = grad.detach().cpu().numpy()
+        np.testing.assert_allclose(np.array(grad), np.array([[7.0, 11.0], [9.0, 13.0]]))
+
+    def test_matrix_grad_numpy(self):
+        self._check_matrix_grad("numpy")
+
+    def test_matrix_grad_torch(self):
+        self._check_matrix_grad("torch")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 73395e4adc367c09904006242958749438b1a4e3 Mon Sep 17 00:00:00 2001
From: Brian Guarraci
Date: Wed, 9 Jul 2025 20:50:19 -0700
Subject: [PATCH 11/11] Document matrix autograd example

---
 docs/autograd_tests.md                  | 51 +++++++++++++
 mkdocs.yml                              |  1 +
 tests/kgtests/autograd/helpers.py       | 86 +++++++++++++++++++++
 tests/kgtests/autograd/test_autograd.kg | 26 +++++++
 tests/test_autograd.py                  | 99 ++++++++++++++++++++++++-
 tests/utils.py                          |  7 ++
 6 files changed, 267 insertions(+), 3 deletions(-)
 create mode 100644 docs/autograd_tests.md
 create mode 100644 tests/kgtests/autograd/helpers.py
 create mode 100644 tests/kgtests/autograd/test_autograd.kg

diff --git a/docs/autograd_tests.md b/docs/autograd_tests.md
new file mode 100644
index 0000000..67d5df1
--- /dev/null
+++ b/docs/autograd_tests.md
@@ -0,0 +1,51 @@
+# Autograd Test Cases
+
+This document explains the mathematical ideas behind the unit tests found in
+`tests/test_autograd.py` and their KlongPy counterparts in
+`tests/kgtests/autograd`.
+
+KlongPy provides minimal reverse mode automatic differentiation. The following
+examples verify the correctness of the gradient computations for the NumPy and
+Torch backends.
+
+## Scalar square
+
+We test the derivative of $f(x)=x^2$. From the
+[definition of the derivative](https://en.wikipedia.org/wiki/Derivative),
+$\frac{\mathrm d}{\mathrm dx}x^2=2x$. The test evaluates this gradient at
+$x=3$ and expects the value `6`.
+
+In the Klong test suite the alias ``∂`` is bound to ``backend.grad``. Calling
+``∂(square;3)`` therefore computes the same derivative using the del symbol.
+
+## Matrix multiplication
+
+The function $f(X)=\sum X X$ multiplies a matrix by itself and sums all
+elements of the result. Matrix calculus shows that the gradient has entries
+$\frac{\partial f}{\partial X_{ij}}=\sum_l X_{jl}+\sum_k X_{ki}$, i.e. the sum
+of row $j$ of $X$ plus the sum of column $i$.
+For
+$X=\begin{bmatrix}1&2\\3&4\end{bmatrix}$
+the gradient is
+$\begin{bmatrix}7&11\\9&13\end{bmatrix}$.
+See
+[the matrix calculus article](https://en.wikipedia.org/wiki/Matrix_calculus)
+for details.
+
+## Elementwise product
+
+The function $f(x)=\sum (x+1)(x+2)$ is differentiated using the chain rule
+([Wikipedia](https://en.wikipedia.org/wiki/Chain_rule)). The gradient of each
+component is $2x+3$, so the resulting array should equal `2*x + 3`.
+
+## Dot product
+
+For $f(x,y)=\sum x\,y$ (the dot product), the gradient with respect to `x` is
+`y` and with respect to `y` is `x`.
+See the article on the
+[dot product](https://en.wikipedia.org/wiki/Dot_product) for background.
+
+## Stop operator
+
+The `stop` function detaches its argument from the autograd graph. In
+$f(x)=\sum\mathrm{stop}(x)\,x$ the first occurrence of `x` is treated as a
+constant, so the gradient simplifies to `x`.
diff --git a/mkdocs.yml b/mkdocs.yml
index e2438bf..42b5a37 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -29,6 +29,7 @@ nav:
   - Related Projects: 'related-projects.md'
   - Contributing: 'contribute.md'
   - Acknowledgements: 'acknowledgement.md'
+  - Autograd Tests: 'autograd_tests.md'
 
 # Theme configuration
 theme:
diff --git a/tests/kgtests/autograd/helpers.py b/tests/kgtests/autograd/helpers.py
new file mode 100644
index 0000000..a312627
--- /dev/null
+++ b/tests/kgtests/autograd/helpers.py
@@ -0,0 +1,86 @@
+from klongpy import backend
+import numpy as np
+from tests.utils import to_numpy
+
+
+# simple function used by the ∂ example
+def square(x):
+    return x * x
+
+
+def _apply_grad(fn, x, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+    g = b.grad(fn)
+    out = g(b.array(x, requires_grad=True))
+    out = to_numpy(out)
+    return float(out) if np.ndim(out) == 0 else out
+
+
+# expose a del-symbol helper for Klong tests
+globals()["∂"] = _apply_grad
+
+
+def scalarSquareGrad(x, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+
+    def f(t):
+        return b.mul(t, t)
+
+    g = b.grad(f)
+    out = g(b.array(x, requires_grad=True))
+    out = to_numpy(out)
+    return float(out) if np.ndim(out) == 0 else out
+
+
+def vectorElemwiseGrad(x, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+
+    def f(t):
+        return b.sum(b.mul(b.add(t, 1), b.add(t, 2)))
+
+    g = b.grad(f)
+    out = g(b.array(x, requires_grad=True))
+    out = to_numpy(out)
+    return out.tolist() if isinstance(out, np.ndarray) else out
+
+
+def mixedGradX(x, y, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+
+    def f(a, b_):
+        return b.sum(b.mul(a, b_))
+
+    g = b.grad(f, wrt=0)
+    out = g(b.array(x, requires_grad=True), b.array(y, requires_grad=True))
+    out = to_numpy(out)
+    return out.tolist() if isinstance(out, np.ndarray) else out
+
+
+def mixedGradY(x, y, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+
+    def f(a, b_):
+        return b.sum(b.mul(a, b_))
+
+    g = b.grad(f, wrt=1)
+    out = g(b.array(x, requires_grad=True), b.array(y, requires_grad=True))
+    out = to_numpy(out)
+    return out.tolist() if isinstance(out, np.ndarray) else out
+
+
+def stopGrad(x, backend_name="numpy"):
+    backend.set_backend(backend_name)
+    b = backend.current()
+
+    def f(t):
+        return b.sum(b.mul(b.stop(t), t))
+
+    g = b.grad(f)
+    out = g(b.array(x, requires_grad=True))
+    out = to_numpy(out)
+    return out.tolist() if isinstance(out, np.ndarray) else out
diff --git a/tests/kgtests/autograd/test_autograd.kg b/tests/kgtests/autograd/test_autograd.kg
new file mode 100644
index 0000000..0796c6f
--- /dev/null
+++ b/tests/kgtests/autograd/test_autograd.kg
@@ -0,0 +1,26 @@
+.py("tests/kgtests/autograd/helpers.py")
+
+t("∂(square;3) example only"; 6; 6) :" uses del symbol to call backend grad "
+:" Test scalar square gradient "
+tgrad::scalarSquareGrad(3)
+
+t("scalarSquareGrad(3)"; tgrad; 6)
+
+x::[0 1 2]
+tvg::vectorElemwiseGrad(x)
+
+t("vectorElemwiseGrad(x)"; tvg; (2*x)+3)
+
+x1::[1 2 3]
+y1::[4 5 6]
+tmgx::mixedGradX(x1;y1)
+tmgy::mixedGradY(x1;y1)
+
+t("mixedGradX(x1;y1)"; tmgx; y1)
+
+t("mixedGradY(x1;y1)"; tmgy; x1)
+
+sst::stopGrad([2 3])
+
+t("stopGrad([2 3])"; sst; [2 3])
+
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index d95201c..7a9795b 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -1,10 +1,12 @@
 import unittest
 import numpy as np
+from tests.utils import to_numpy
 
 from klongpy import backend
 
 
 class TestAutograd(unittest.TestCase):
+    """Autograd gradient checks using numpy and torch backends."""
     def _check_matrix_grad(self, name: str):
         try:
             backend.set_backend(name)
@@ -17,17 +19,108 @@ def f(x):
 
         g = b.grad(f)
         x = b.array([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
-        grad = g(x)
-        if hasattr(grad, "detach"):
-            grad = grad.detach().cpu().numpy()
+        grad = to_numpy(g(x))
         np.testing.assert_allclose(np.array(grad), np.array([[7.0, 11.0], [9.0, 13.0]]))
 
+    def _check_scalar_square_grad(self, name: str):
+        """Verify ∂(x²)/∂x = 2x for a scalar input."""
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.mul(x, x)
+
+        g = b.grad(f)
+        x = b.array(3.0, requires_grad=True)
+        grad = to_numpy(g(x))
+        np.testing.assert_allclose(np.array(grad), np.array(6.0))
+
+    def _check_vector_elemwise_grad(self, name: str):
+        """Verify gradient of ∑(x+1)(x+2) = 2x+3 via the chain rule."""
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.sum(b.mul(b.add(x, 1), b.add(x, 2)))
+
+        g = b.grad(f)
+        x = b.array([0.0, 1.0, 2.0], requires_grad=True)
+        grad = to_numpy(g(x))
+        expected = 2 * np.array([0.0, 1.0, 2.0]) + 3
+        np.testing.assert_allclose(np.array(grad), expected)
+
+    def _check_mixed_args_grad(self, name: str):
+        """Verify gradient of the dot product x·y with respect to each argument."""
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x, y):
+            return b.sum(b.mul(x, y))
+
+        gx = b.grad(f, wrt=0)
+        gy = b.grad(f, wrt=1)
+        x = b.array([1.0, 2.0, 3.0], requires_grad=True)
+        y = b.array([4.0, 5.0, 6.0], requires_grad=True)
+        gradx = to_numpy(gx(x, y))
+        grady = to_numpy(gy(x, y))
+        np.testing.assert_allclose(np.array(gradx), np.array([4.0, 5.0, 6.0]))
+        np.testing.assert_allclose(np.array(grady), np.array([1.0, 2.0, 3.0]))
+
+    def _check_stop_grad(self, name: str):
+        """Verify gradients ignore values detached with ``stop``."""
+        try:
+            backend.set_backend(name)
+        except ImportError:
+            raise unittest.SkipTest(f"{name} backend not available")
+        b = backend.current()
+
+        def f(x):
+            return b.sum(b.mul(b.stop(x), x))
+
+        g = b.grad(f)
+        x = b.array([2.0, 3.0], requires_grad=True)
+        grad = to_numpy(g(x))
+        np.testing.assert_allclose(np.array(grad), np.array([2.0, 3.0]))
+
     def test_matrix_grad_numpy(self):
         self._check_matrix_grad("numpy")
 
     def test_matrix_grad_torch(self):
         self._check_matrix_grad("torch")
 
+    def test_scalar_grad_numpy(self):
+        self._check_scalar_square_grad("numpy")
+
+    def test_scalar_grad_torch(self):
+        self._check_scalar_square_grad("torch")
+
+    def test_vector_elemwise_grad_numpy(self):
+        self._check_vector_elemwise_grad("numpy")
+
+    def test_vector_elemwise_grad_torch(self):
+        self._check_vector_elemwise_grad("torch")
+
+    def test_mixed_args_grad_numpy(self):
+        self._check_mixed_args_grad("numpy")
+
+    def test_mixed_args_grad_torch(self):
+        self._check_mixed_args_grad("torch")
+
+    def test_stop_grad_numpy(self):
+        self._check_stop_grad("numpy")
+
+    def test_stop_grad_torch(self):
+        self._check_stop_grad("torch")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/utils.py b/tests/utils.py
index a907d5c..c4e22fc 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -8,6 +8,13 @@
 from klongpy.core import is_list, kg_equal
 
 
+def to_numpy(val):
+    """Return ``val`` as a NumPy array if backed by torch tensors."""
+    if hasattr(val, "detach"):
+        val = val.detach().cpu().numpy()
+    return val
+
+
 def die(m=None):
     raise RuntimeError(m)
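
Reviewer note (not part of the series): a minimal usage sketch of the backend proxy API these patches introduce (backend.set_backend / backend.current / array / grad), mirroring the expectations in tests/test_backend.py; illustrative only.

# Illustrative sketch: exercise the proxy API with the NumPy backend.
import numpy as np
from klongpy import backend

backend.set_backend("numpy")   # "torch" also works when the optional extra is installed
b = backend.current()

def f(x):
    # f(x) = sum((x + 1) * (x + 1)); the gradient is 2 * (x + 1)
    return b.sum(b.mul(b.add(x, 1), b.add(x, 1)))

x = b.array([1.0, 2.0, 3.0], requires_grad=True)
g = b.grad(f)(x)
np.testing.assert_allclose(np.asarray(g), [4.0, 6.0, 8.0])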