Skip to content

Commit 0627c84

Browse files
hawkinsp authored and Google-ML-Automation committed
Fix mishandling of large matrices in batched eigendecomposition.
Fixes #33062 PiperOrigin-RevId: 830926150
1 parent 7bb44b6 commit 0627c84

File tree

3 files changed

+19
-3
lines changed

3 files changed

+19
-3
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ When releasing, please add the new-release-boilerplate to docs/pallas/CHANGELOG.
4343
decomposition on CUDA GPUs. This is also an alias for the existing algorithm
4444
on TPUs.
4545
46+
* Bug fixes:
47+
48+
* Fixed a bug introduced in JAX 0.7.2 where eigh failed for large matrices on
49+
GPU ({jax-issue}`#33062`).
50+
4651
* Deprecations:
4752
* Default `axis_types` of `jax.make_mesh` will change in JAX v0.9.0 to return
4853
`jax.sharding.AxisType.Explicit`. Leaving axis_types unspecified will raise a

jaxlib/gpu/solver_kernels_ffi.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -537,9 +537,10 @@ ffi::Error Syevd64Impl(int64_t batch, int64_t n, gpuStream_t stream,
537537
int64_t batch_step = 1;
538538
FFI_ASSIGN_OR_RETURN(bool is_batched_syev_supported,
539539
IsSyevBatchedSupported());
540-
if (is_batched_syev_supported) {
540+
if (is_batched_syev_supported && n > 0) {
541541
int64_t matrix_size = n * n * ffi::ByteWidth(dataType);
542-
batch_step = std::numeric_limits<int>::max() / matrix_size;
542+
batch_step =
543+
std::max(int64_t(1), std::numeric_limits<int>::max() / matrix_size);
543544
if (batch_step >= 32 * 1024) {
544545
batch_step = 32 * 1024;
545546
}
@@ -585,7 +586,7 @@ ffi::Error Syevd64Impl(int64_t batch, int64_t n, gpuStream_t stream,
585586

586587
for (int64_t i = 0; i < batch; i += batch_step) {
587588
size_t batch_size = static_cast<size_t>(std::min(batch_step, batch - i));
588-
if (is_batched_syev_supported) {
589+
if (is_batched_syev_supported && batch_step > 1) {
589590
JAX_FFI_RETURN_IF_GPU_ERROR(gpusolverDnXsyevBatched(
590591
handle.get(), params, jobz, uplo, n, aType, out_data, n, wType,
591592
w_data, aType, workspaceOnDevice, workspaceInBytesOnDevice,

tests/linalg_test.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from functools import partial
1616
import itertools
1717
from collections.abc import Iterator
18+
import unittest
1819

1920
import numpy as np
2021
import scipy
@@ -457,6 +458,15 @@ def testEigh(self, n, dtype, lower):
457458
w_np.astype(w.dtype), w, atol=tol * np.linalg.norm(a), rtol=tol
458459
)
459460

461+
@jax._src.config.explicit_x64_dtypes("allow")
462+
@jtu.run_on_devices("gpu")
463+
@unittest.skip("Needs a large amount of GPU memory, doesn't work in CI")
464+
def testEighLargeMatrix(self):
465+
# https://github.com/jax-ml/jax/issues/33062
466+
n = 16384
467+
A = jnp.eye(n, dtype=jnp.float64)
468+
jax.block_until_ready(jax.lax.linalg.eigh(A))
469+
460470
@jtu.sample_product(
461471
start=[0, 1, 63, 64, 65, 255],
462472
end=[1, 63, 64, 65, 256],

0 commit comments

Comments (0)