From 1c73800bf897f5cb13e085a43340958621554a3d Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 8 Apr 2026 17:08:37 -0500 Subject: [PATCH 1/4] cuda jl tasking documentation updates --- .githash | 2 +- .pre-commit-config.yaml | 10 +++++ docs/src/api.md | 81 ++++++++++++++++++++++++++++++++++++++++- ext/CUDAExt/CUDAExt.jl | 2 +- ext/CUDAExt/cuda.jl | 4 ++ src/scoping.jl | 2 - src/util.jl | 15 ++++++++ src/warnings.jl | 2 - 8 files changed, 111 insertions(+), 7 deletions(-) diff --git a/.githash b/.githash index 87bef491..e1142e7d 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -224cac24f15449a338abebe1bc17ba74c07d9f5c +0c87a32be84ecaf7346c8f4f028eae208a5a7cda diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a201053..37b95947 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,4 @@ +default_stages: [pre-commit] repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 @@ -21,3 +22,12 @@ repos: entry: julia --project=. -e 'using JuliaFormatter; format(ARGS, verbose=true)' language: system types: [julia] + + - id: update-githash + name: Update .githash + description: "Write the current git commit hash to .githash" + entry: bash -c 'git rev-parse HEAD > .githash' + language: system + always_run: true + pass_filenames: false + stages: [post-commit] diff --git a/docs/src/api.md b/docs/src/api.md index fbeaa3d5..0ab680b7 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -14,7 +14,86 @@ Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumer ``` # CUDA.jl Tasking -This section will detail how to use custom CUDA.jl kernels with the Legate runtime. This is still a work in progress + +Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling. + +!!! warning "Experimental Feature" + CUDA.jl tasking is experimental. 
You must opt in before using `@cuda_task` or `@launch`: + ```julia + cuNumeric.Experimental(true) + ``` + +The interface has two steps: +1. **Compile & Register** — [`@cuda_task`](@ref) JIT-compiles a kernel to PTX and registers it with Legate. +2. **Launch** — [`@launch`](@ref) submits the kernel with grid dimensions, inputs, outputs, and scalars. + +`NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy. + +!!! note "Argument ordering" + Legate passes kernel arguments in the order: **inputs → outputs → scalars**. + Your kernel signature must match this ordering. + +!!! warning "Inputs vs. outputs" + Correctly separating `inputs` and `outputs` is critical for Legate's + dependency analysis. If an array is both read and written, list it as an `output`. + +## Example + +```julia +using cuNumeric +using CUDA +import CUDA: i32 + +# Enable experimental features +cuNumeric.Experimental(true) + +# 1. Write a standard CUDA.jl kernel +function kernel_sin(a, b, N) + i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x + if i <= N + @inbounds b[i] = sin(a[i]) + end + return nothing +end + +N = 1024 +threads = 256 +blocks = cld(N, threads) + +a = cuNumeric.fill(1.0f0, N) +b = cuNumeric.zeros(Float32, N) + +# 2. Compile & register — args are used only for type inference +task = cuNumeric.@cuda_task kernel_sin(a, b, UInt32(1)) + +# 3. Launch through Legate +cuNumeric.@launch task=task threads=threads blocks=blocks inputs=a outputs=b scalars=UInt32(N) + +allowscalar() do + println("sin(1) = ", b[:][1]) # ≈ 0.8414709 +end +``` + +See `examples/custom_cuda.jl` for a more complete example with multiple kernels. 
+ +## `@launch` Keywords + +| Keyword | Type | Default | Description | +|-----------|----------------------|----------|--------------------------------| +| `task` | `CUDATask` | required | Compiled kernel handle | +| `blocks` | `Int` or `Tuple` | `(1,)` | CUDA grid dimensions | +| `threads` | `Int` or `Tuple` | `(256,)` | CUDA block dimensions | +| `inputs` | `NDArray` or `Tuple` | `()` | Read-only input arrays | +| `outputs` | `NDArray` or `Tuple` | `()` | Read-write output arrays | +| `scalars` | scalar or `Tuple` | `()` | Scalar kernel arguments | + +## Limitations + +- Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly. +- Mismatched array sizes are automatically padded to the largest shape. +- Custom function broadcasting is not supported; write explicit index-based kernels. + +## API Reference ```@autodocs Modules = [cuNumeric] diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index e991aac6..ed9c07c6 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -6,7 +6,7 @@ using Legate: Legate using CxxWrap: CxxWrap using cuNumeric: cuNumeric import cuNumeric: - @cuda_task, @launch, NDArray + @cuda_task, @launch, NDArray, assert_experimental const KERNEL_OFFSET = sizeof(CUDA.KernelState) diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl index a70a5b90..fcb7a003 100644 --- a/ext/CUDAExt/cuda.jl +++ b/ext/CUDAExt/cuda.jl @@ -143,6 +143,8 @@ function cuNumeric.ptx_task(ptx::String, kernel_name) end macro cuda_task(call_expr) + cuNumeric.assert_experimental() + fname = call_expr.args[1] fargs = call_expr.args[2:end] @@ -164,6 +166,8 @@ macro cuda_task(call_expr) end macro launch(args...) + cuNumeric.assert_experimental() + allowed_keys = Set([:task, :blocks, :threads, :inputs, :outputs, :scalars]) kwargs = Dict{Symbol,Any}() diff --git a/src/scoping.jl b/src/scoping.jl index ecac81ca..6983c455 100644 --- a/src/scoping.jl +++ b/src/scoping.jl @@ -1,5 +1,3 @@ -# TODO reduce number of allocations. 
Potentially remove assigned_vars somehow - export @cunumeric @doc""" diff --git a/src/util.jl b/src/util.jl index a47f3ad5..96856ab7 100644 --- a/src/util.jl +++ b/src/util.jl @@ -15,3 +15,18 @@ preceding the call to this function. function get_time_nanoseconds() return Legate.time_nanoseconds() end + +function Experimental(setting::Bool) + task_local_storage(:Experimental, setting) +end + +function assert_experimental() + if get(task_local_storage(), :Experimental, false) !== true + throw( + ArgumentError( + "Experimental features are disabled." * + " Use `cuNumeric.Experimental(true)` to enable them.", + ), + ) + end +end diff --git a/src/warnings.jl b/src/warnings.jl index c993571e..db0ae37f 100644 --- a/src/warnings.jl +++ b/src/warnings.jl @@ -1,5 +1,3 @@ -### THE SCALAR INDEXING LOGIC IS COPIED FROM GPUArrays.jl ### - export allowpromotion, @allowpromotion, assertpromotion, allowscalar, @allowscalar, assertscalar @enum ImplicitPromotion PromotionAllowed PromotionWarn PromotionWarned PromotionDisallowed From 54c8286aed55dc3987e726d5defc67dbb09ff41f Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 8 Apr 2026 17:35:44 -0500 Subject: [PATCH 2/4] fix cuda.jl to support claims made in documentation --- .githash | 2 +- docs/src/api.md | 2 +- ext/CUDAExt/cuda.jl | 17 +++++++++++------ lib/cunumeric_jl_wrapper/src/cuda.cpp | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.githash b/.githash index e1142e7d..eb6896ec 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -0c87a32be84ecaf7346c8f4f028eae208a5a7cda +1c73800bf897f5cb13e085a43340958621554a3d diff --git a/docs/src/api.md b/docs/src/api.md index 0ab680b7..51d5879f 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -97,7 +97,7 @@ See `examples/custom_cuda.jl` for a more complete example with multiple kernels. 
```@autodocs Modules = [cuNumeric] -Pages = ["cuda.jl"] +Pages = ["utilities/cuda_stubs.jl"] ``` # CNPreferences diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl index fcb7a003..05d7e9f1 100644 --- a/ext/CUDAExt/cuda.jl +++ b/ext/CUDAExt/cuda.jl @@ -74,6 +74,11 @@ function check_sz(arr, maxshape) end end +function nda_to_logical_array(arr::NDArray{T,N}) where {T,N} + st_handle = cuNumeric.get_store(arr) + return Legate.LogicalArray{T,N}(st_handle[], size(arr)) +end + function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads) @@ -92,16 +97,16 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, input_vars = Vector{Legate.Variable}() for arr in inputs check_sz!(arr, max_shape; copy=true) - store = cuNumeric.get_store(arr) - p = Legate.add_input(task, store) + la = nda_to_logical_array(arr) + p = Legate.add_input(task, la) push!(input_vars, p) end output_vars = Vector{Legate.Variable}() for arr in outputs check_sz!(arr, max_shape; copy=false) - store = cuNumeric.get_store(arr) - p = Legate.add_output(task, store) + la = nda_to_logical_array(arr) + p = Legate.add_output(task, la) push!(output_vars, p) end @@ -115,8 +120,8 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, Legate.add_scalar(task, Legate.Scalar(s)) # 7+ -> ARG_OFFSET end - # all inputs are alligned with all outputs - Legate.add_default_alignment(task, input_vars, output_vars) + # all inputs are aligned with all outputs + Legate.default_alignment(task, input_vars, output_vars) Legate.submit_auto_task(rt, task) end diff --git a/lib/cunumeric_jl_wrapper/src/cuda.cpp b/lib/cunumeric_jl_wrapper/src/cuda.cpp index 4d5256bf..d5e779d0 100644 --- a/lib/cunumeric_jl_wrapper/src/cuda.cpp +++ b/lib/cunumeric_jl_wrapper/src/cuda.cpp @@ -29,7 +29,7 @@ #include "types.h" #include "ufi.h" -#define CUDA_DEBUG 1 +#define CUDA_DEBUG 0 #define BLOCK_START 1 #define 
THREAD_START 4 From 17adad8a6f9dfd2a79e3a9297020abeedfe5a10f Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 8 Apr 2026 17:45:40 -0500 Subject: [PATCH 3/4] change ordering of documentation --- .githash | 2 +- docs/src/api.md | 33 +++++++++------------------------ 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/.githash b/.githash index eb6896ec..505cc7e0 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -1c73800bf897f5cb13e085a43340958621554a3d +54c8286aed55dc3987e726d5defc67dbb09ff41f diff --git a/docs/src/api.md b/docs/src/api.md index 51d5879f..155da1ac 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -13,6 +13,15 @@ Modules = [cuNumeric] Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"] ``` +# CNPreferences + +This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md). + +```@autodocs +Modules = [CNPreferences] +Pages = ["CNPreferences.jl"] +``` + # CUDA.jl Tasking Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling. @@ -29,10 +38,6 @@ The interface has two steps: `NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy. -!!! note "Argument ordering" - Legate passes kernel arguments in the order: **inputs → outputs → scalars**. - Your kernel signature must match this ordering. - !!! warning "Inputs vs. outputs" Correctly separating `inputs` and `outputs` is critical for Legate's dependency analysis. If an array is both read and written, list it as an `output`. @@ -76,17 +81,6 @@ end See `examples/custom_cuda.jl` for a more complete example with multiple kernels. 
-## `@launch` Keywords - -| Keyword | Type | Default | Description | -|-----------|----------------------|----------|--------------------------------| -| `task` | `CUDATask` | required | Compiled kernel handle | -| `blocks` | `Int` or `Tuple` | `(1,)` | CUDA grid dimensions | -| `threads` | `Int` or `Tuple` | `(256,)` | CUDA block dimensions | -| `inputs` | `NDArray` or `Tuple` | `()` | Read-only input arrays | -| `outputs` | `NDArray` or `Tuple` | `()` | Read-write output arrays | -| `scalars` | scalar or `Tuple` | `()` | Scalar kernel arguments | - ## Limitations - Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly. @@ -100,15 +94,6 @@ Modules = [cuNumeric] Pages = ["utilities/cuda_stubs.jl"] ``` -# CNPreferences - -This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md). - -```@autodocs -Modules = [CNPreferences] -Pages = ["CNPreferences.jl"] -``` - # Internal API ```@autodocs From a54b006495ac58dd42c7b85cbbb38d00af89775a Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 8 Apr 2026 17:59:16 -0500 Subject: [PATCH 4/4] even more simple --- .githash | 2 +- docs/src/api.md | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.githash b/.githash index 505cc7e0..6dd09371 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -54c8286aed55dc3987e726d5defc67dbb09ff41f +17adad8a6f9dfd2a79e3a9297020abeedfe5a10f diff --git a/docs/src/api.md b/docs/src/api.md index 155da1ac..91ae5d5a 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -42,6 +42,9 @@ The interface has two steps: Correctly separating `inputs` and `outputs` is critical for Legate's dependency analysis. If an array is both read and written, list it as an `output`. +!!! warning "Array sizes" + Mismatched array sizes are automatically padded to the largest shape. 
As an alternative to this implicit padding, we plan to add support for other Legate partitioning constraints in the future (see the [Legate partitioning documentation](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__partitioning.html)). + ## Example ```julia @@ -81,12 +84,6 @@ end See `examples/custom_cuda.jl` for a more complete example with multiple kernels. -## Limitations - -- Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly. -- Mismatched array sizes are automatically padded to the largest shape. -- Custom function broadcasting is not supported; write explicit index-based kernels. - ## API Reference ```@autodocs