Skip to content

Fix building with CUDA toolkit 13.2#3273

Merged
zcbenz merged 4 commits into ml-explore:main from
zcbenz:cuda-13-2
Mar 18, 2026
Merged

Fix building with CUDA toolkit 13.2#3273
zcbenz merged 4 commits into ml-explore:main from
zcbenz:cuda-13-2

Conversation

@zcbenz
Copy link
Copy Markdown
Collaborator

@zcbenz zcbenz commented Mar 18, 2026

Copy link
Copy Markdown
Member

@angeloskath angeloskath left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome!

Out of curiosity why the need for launch bounds?

@zcbenz
Copy link
Copy Markdown
Collaborator Author

zcbenz commented Mar 18, 2026

When __launch_bounds__ is not specified, CUDA uses heuristics to determine the resources used by the kernel. CUDA 13.2 seems to have a bug where it requests too many resources, so the kernel fails to launch with "too many resources requested for launch".

@zcbenz zcbenz merged commit 75f74ea into ml-explore:main Mar 18, 2026
16 checks passed
@zcbenz zcbenz deleted the cuda-13-2 branch March 18, 2026 23:31
@zcbenz
Copy link
Copy Markdown
Collaborator Author

zcbenz commented Mar 21, 2026

For future reference here is the independent test case for the CUDA 13.2 bug:

// bug.cu
// Build: nvcc bug.cu -o bug -std=c++20 "--generate-code=arch=compute_120a,code=[compute_120a,sm_120a]"
// Run: ./bug

#define CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING

#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>
#include <cuda/cmath>

#include <cooperative_groups.h>

namespace {

namespace cg = cooperative_groups;

// Fixed-size array of N elements of T whose alignment (sizeof(T) * N) allows
// the whole struct to be moved with a single vectorized load/store.
template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVector {
  T val[N];

  // Mutable element access.
  __device__ T& operator[](int i) {
    return *(val + i);
  }

  // Read-only element access, returned by value.
  __device__ T operator[](int i) const {
    return *(val + i);
  }
};

// True when `x` sits on a boundary suitable for an AlignedVector<T, N>
// access, i.e. its address is a multiple of N * sizeof(T).
template <int N, typename T>
inline __host__ __device__ bool is_aligned(T* x) {
  const uintptr_t boundary = static_cast<uintptr_t>(N) * sizeof(T);
  return reinterpret_cast<uintptr_t>(x) % boundary == 0;
}

// Read N contiguous elements as one vector, with no alignment check;
// caller must guarantee `ptr` satisfies AlignedVector<T, N> alignment.
template <int N, typename T>
inline __device__ AlignedVector<T, N> unsafe_load_vector(
    const T* ptr,
    uint32_t offset) {
  return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
}

// Read N contiguous elements starting at ptr[offset * N]: a single vector
// load when `ptr` is suitably aligned, an element-wise gather otherwise.
template <int N, typename T>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset) {
  if (!is_aligned<N>(ptr)) {
    // Misaligned source: gather one element at a time.
    AlignedVector<T, N> out;
#pragma unroll
    for (int j = 0; j < N; ++j) {
      out[j] = ptr[offset * N + j];
    }
    return out;
  }
  return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
}

// Bounds-aware vectorized load: one aligned vector read when the whole
// N-wide chunk is in range; otherwise a per-element gather that fills
// out-of-range slots with `fallback`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N>
load_vector(const T* ptr, uint32_t offset, SizeT size, T fallback) {
  if (is_aligned<N>(ptr) && (offset + 1) * N <= size) {
    // Fast path: aligned and fully in bounds.
    return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
  }
  AlignedVector<T, N> out;
#pragma unroll
  for (int j = 0; j < N; ++j) {
    out[j] = (N * offset + j) < size ? ptr[offset * N + j] : fallback;
  }
  return out;
}

// Bounds- and stride-aware vectorized load. The single aligned vector read
// is only taken for unit stride with the full N-wide chunk in range;
// otherwise elements are gathered individually at `stride` spacing, using
// `fallback` for slots past `size`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset,
    SizeT size,
    int64_t stride,
    T fallback) {
  bool vectorizable =
      is_aligned<N>(ptr) && stride == 1 && (offset + 1) * N <= size;
  if (vectorizable) {
    return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
  }
  AlignedVector<T, N> out;
#pragma unroll
  for (int j = 0; j < N; ++j) {
    out[j] =
        (N * offset + j) < size ? ptr[stride * (offset * N + j)] : fallback;
  }
  return out;
}

// Write a vector with no alignment check; caller must guarantee `ptr`
// satisfies AlignedVector<T, N> alignment.
template <int N, typename T>
inline __device__ void
unsafe_store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  reinterpret_cast<AlignedVector<T, N>*>(ptr)[offset] = vec;
}

// Write N contiguous elements starting at ptr[offset * N]: a single vector
// store when `ptr` is suitably aligned, element-wise otherwise.
template <int N, typename T>
inline __device__ void
store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  if (!is_aligned<N>(ptr)) {
#pragma unroll
    for (int j = 0; j < N; ++j) {
      ptr[offset * N + j] = vec[j];
    }
    return;
  }
  reinterpret_cast<AlignedVector<T, N>*>(ptr)[offset] = vec;
}


// Binary max functor. Integral types use the built-in max(); floating-point
// types propagate NaN (a NaN in either operand yields a NaN result: the
// first operand is checked explicitly, and a NaN second operand falls out
// of the comparison since `x > NaN` is false).
struct Maximum {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return max(x, y);
    } else {
      return cuda::std::isnan(x) ? x : (x > y ? x : y);
    }
  }
};

// Elementwise binary op: out[i] = Op(a[i], b[i]) for i in [0, size).
// Each thread handles N_READS contiguous elements via vectorized loads;
// a thread whose chunk would run past `size` falls back to a scalar tail
// loop. The launch must cover the data: grid_threads * N_READS >= size.
//
// NOTE(review): __launch_bounds__ is deliberately absent — this kernel
// reproduces the CUDA 13.2 resource-estimation bug ("too many resources
// requested for launch") described in the discussion above; keep the code
// byte-identical or the repro may stop triggering.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void binary_vv(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  // Flat thread rank across the whole grid (cooperative-groups view).
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS > size) {
    // Tail chunk: process the remaining (< N_READS) elements one by one.
    // Threads entirely past the end see an empty range and do nothing.
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = Op{}(a[i], b[i]);
    }
  } else {
    // Full chunk: one vectorized load per operand, vectorized store out.
    auto a_vec = load_vector<N_READS>(a, index);
    auto b_vec = load_vector<N_READS>(b, index);

    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = Op{}(a_vec[i], b_vec[i]);
    }

    store_vector<N_READS>(out, index, out_vec);
  }
}

// Abort the process with a diagnostic naming the failing call site (`where`)
// if a CUDA runtime API call did not return cudaSuccess.
void checkCuda(cudaError_t err, const char* where) {
  if (err == cudaSuccess) {
    return;
  }
  std::cerr << "CUDA error at " << where << ": " << cudaGetErrorString(err) << "\n";
  std::exit(1);
}

} // namespace

// Driver for the repro: launches binary_vv<Maximum, uint8_t, ...> with
// N_READS = 16 over 16384 bytes (2 blocks x 1024 threads = 2048 threads;
// the first 1024 each handle one 16-byte vector, the rest hit the empty
// tail). On a buggy CUDA 13.2 toolchain the launch fails with
// "too many resources requested for launch"; otherwise prints OK.
int main() {
  uint32_t size = 16384;
  uint8_t* a;
  uint8_t* b;
  // Fix: the error labels previously said "cudaMallocManaged" although the
  // calls are plain cudaMalloc — make the diagnostics name the real call.
  checkCuda(cudaMalloc(&a, size * sizeof(uint8_t)), "cudaMalloc a");
  checkCuda(cudaMalloc(&b, size * sizeof(uint8_t)), "cudaMalloc b");

  auto* func = binary_vv<Maximum, uint8_t, uint8_t, uint32_t, 16>;
  // Kernel args: a op a -> b. Addresses of the host variables, per the
  // cudaLaunchKernelExC argument-passing convention.
  void* args[] = {&a, &a, &b, &size};

  cudaLaunchConfig_t config = {};
  config.gridDim = 2;
  config.blockDim = 1024;
  checkCuda(cudaLaunchKernelExC(&config, func, args), "cudaLaunchKernelExC");

  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Fix: free both allocations (b was previously leaked).
  cudaFree(a);
  cudaFree(b);
  printf("OK\n");
  return 0;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Illegal memory access in reduce kernel when built with CUDA Toolkit 13.1

2 participants