Skip to content

Fix building with CUDA toolkit 13.2#3273

Merged
zcbenz merged 4 commits into ml-explore:main from
zcbenz:cuda-13-2
Mar 18, 2026
Merged

Fix building with CUDA toolkit 13.2#3273
zcbenz merged 4 commits into ml-explore:main from
zcbenz:cuda-13-2

Conversation

@zcbenz
Copy link
Copy Markdown
Collaborator

@zcbenz zcbenz commented Mar 18, 2026

Copy link
Copy Markdown
Member

@angeloskath angeloskath left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome!

Out of curiosity why the need for launch bounds?

@zcbenz
Copy link
Copy Markdown
Collaborator Author

zcbenz commented Mar 18, 2026

When __launch_bounds__ is not specified, CUDA uses heuristics to determine the resources used by the kernel. CUDA 13.2 seems to have a bug where it requests too many resources, so the kernel fails to launch with "too many resources requested for launch".

@zcbenz zcbenz merged commit 75f74ea into ml-explore:main Mar 18, 2026
16 checks passed
@zcbenz zcbenz deleted the cuda-13-2 branch March 18, 2026 23:31
@zcbenz
Copy link
Copy Markdown
Collaborator Author

zcbenz commented Mar 21, 2026

For future reference here is the independent test case for the CUDA 13.2 bug:

// bug.cu
// Build: nvcc bug.cu -o bug -std=c++20 "--generate-code=arch=compute_120a,code=[compute_120a,sm_120a]"
// Run: ./bug

#define CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING

#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>
#include <cuda/cmath>

#include <cooperative_groups.h>

namespace {

namespace cg = cooperative_groups;

// Fixed-size array of N elements of T whose alignment (sizeof(T) * N) allows
// the whole struct to be moved with a single vectorized load/store.
template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVector {
  T val[N];

  // Mutable element access.
  __device__ T& operator[](int i) {
    return *(val + i);
  }

  // Read-only element access, returned by value.
  __device__ T operator[](int i) const {
    return *(val + i);
  }
};

// True when `x` sits on a boundary suitable for an AlignedVector<T, N>
// access, i.e. its address is a multiple of N * sizeof(T).
template <int N, typename T>
inline __host__ __device__ bool is_aligned(T* x) {
  const uintptr_t boundary = static_cast<uintptr_t>(N) * sizeof(T);
  return reinterpret_cast<uintptr_t>(x) % boundary == 0;
}

// Read N contiguous elements as one vector, with no alignment check;
// caller must guarantee `ptr` satisfies AlignedVector<T, N> alignment.
template <int N, typename T>
inline __device__ AlignedVector<T, N> unsafe_load_vector(
    const T* ptr,
    uint32_t offset) {
  return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
}

// Read N contiguous elements starting at ptr[offset * N]: a single vector
// load when `ptr` is suitably aligned, an element-wise gather otherwise.
template <int N, typename T>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset) {
  if (!is_aligned<N>(ptr)) {
    // Misaligned source: gather one element at a time.
    AlignedVector<T, N> out;
#pragma unroll
    for (int j = 0; j < N; ++j) {
      out[j] = ptr[offset * N + j];
    }
    return out;
  }
  return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
}

// Bounds-aware vectorized load: one aligned vector read when the whole
// N-wide chunk is in range; otherwise a per-element gather that fills
// out-of-range slots with `fallback`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N>
load_vector(const T* ptr, uint32_t offset, SizeT size, T fallback) {
  if (is_aligned<N>(ptr) && (offset + 1) * N <= size) {
    // Fast path: aligned and fully in bounds.
    return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
  }
  AlignedVector<T, N> out;
#pragma unroll
  for (int j = 0; j < N; ++j) {
    out[j] = (N * offset + j) < size ? ptr[offset * N + j] : fallback;
  }
  return out;
}

// Bounds- and stride-aware vectorized load. The single aligned vector read
// is only taken for unit stride with the full N-wide chunk in range;
// otherwise elements are gathered individually at `stride` spacing, using
// `fallback` for slots past `size`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset,
    SizeT size,
    int64_t stride,
    T fallback) {
  bool vectorizable =
      is_aligned<N>(ptr) && stride == 1 && (offset + 1) * N <= size;
  if (vectorizable) {
    return reinterpret_cast<const AlignedVector<T, N>*>(ptr)[offset];
  }
  AlignedVector<T, N> out;
#pragma unroll
  for (int j = 0; j < N; ++j) {
    out[j] =
        (N * offset + j) < size ? ptr[stride * (offset * N + j)] : fallback;
  }
  return out;
}

// Write a vector with no alignment check; caller must guarantee `ptr`
// satisfies AlignedVector<T, N> alignment.
template <int N, typename T>
inline __device__ void
unsafe_store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  reinterpret_cast<AlignedVector<T, N>*>(ptr)[offset] = vec;
}

// Write N contiguous elements starting at ptr[offset * N]: a single vector
// store when `ptr` is suitably aligned, element-wise otherwise.
template <int N, typename T>
inline __device__ void
store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  if (!is_aligned<N>(ptr)) {
#pragma unroll
    for (int j = 0; j < N; ++j) {
      ptr[offset * N + j] = vec[j];
    }
    return;
  }
  reinterpret_cast<AlignedVector<T, N>*>(ptr)[offset] = vec;
}


// Binary max functor. Integral types use the built-in max(); floating-point
// types propagate NaN (a NaN in either operand yields a NaN result: the
// first operand is checked explicitly, and a NaN second operand falls out
// of the comparison since `x > NaN` is false).
struct Maximum {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return max(x, y);
    } else {
      return cuda::std::isnan(x) ? x : (x > y ? x : y);
    }
  }
};

// Elementwise binary op: out[i] = Op(a[i], b[i]) for i in [0, size).
// Each thread handles N_READS contiguous elements via vectorized loads;
// a thread whose chunk would run past `size` falls back to a scalar tail
// loop. The launch must cover the data: grid_threads * N_READS >= size.
//
// NOTE(review): __launch_bounds__ is deliberately absent — this kernel
// reproduces the CUDA 13.2 resource-estimation bug ("too many resources
// requested for launch") described in the discussion above; keep the code
// byte-identical or the repro may stop triggering.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void binary_vv(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  // Flat thread rank across the whole grid (cooperative-groups view).
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS > size) {
    // Tail chunk: process the remaining (< N_READS) elements one by one.
    // Threads entirely past the end see an empty range and do nothing.
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = Op{}(a[i], b[i]);
    }
  } else {
    // Full chunk: one vectorized load per operand, vectorized store out.
    auto a_vec = load_vector<N_READS>(a, index);
    auto b_vec = load_vector<N_READS>(b, index);

    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = Op{}(a_vec[i], b_vec[i]);
    }

    store_vector<N_READS>(out, index, out_vec);
  }
}

// Abort the process with a diagnostic naming the failing call site (`where`)
// if a CUDA runtime API call did not return cudaSuccess.
void checkCuda(cudaError_t err, const char* where) {
  if (err == cudaSuccess) {
    return;
  }
  std::cerr << "CUDA error at " << where << ": " << cudaGetErrorString(err) << "\n";
  std::exit(1);
}

} // namespace

// Driver for the repro: launches binary_vv<Maximum, uint8_t, ...> with
// N_READS = 16 over 16384 bytes (2 blocks x 1024 threads = 2048 threads;
// the first 1024 each handle one 16-byte vector, the rest hit the empty
// tail). On a buggy CUDA 13.2 toolchain the launch fails with
// "too many resources requested for launch"; otherwise prints OK.
int main() {
  uint32_t size = 16384;
  uint8_t* a;
  uint8_t* b;
  // Fix: the error labels previously said "cudaMallocManaged" although the
  // calls are plain cudaMalloc — make the diagnostics name the real call.
  checkCuda(cudaMalloc(&a, size * sizeof(uint8_t)), "cudaMalloc a");
  checkCuda(cudaMalloc(&b, size * sizeof(uint8_t)), "cudaMalloc b");

  auto* func = binary_vv<Maximum, uint8_t, uint8_t, uint32_t, 16>;
  // Kernel args: a op a -> b. Addresses of the host variables, per the
  // cudaLaunchKernelExC argument-passing convention.
  void* args[] = {&a, &a, &b, &size};

  cudaLaunchConfig_t config = {};
  config.gridDim = 2;
  config.blockDim = 1024;
  checkCuda(cudaLaunchKernelExC(&config, func, args), "cudaLaunchKernelExC");

  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Fix: free both allocations (b was previously leaked).
  cudaFree(a);
  cudaFree(b);
  printf("OK\n");
  return 0;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Illegal memory access in reduce kernel when built with CUDA Toolkit 13.1

2 participants