Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .githash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
224cac24f15449a338abebe1bc17ba74c07d9f5c
17adad8a6f9dfd2a79e3a9297020abeedfe5a10f
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
default_stages: [pre-commit]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
Expand All @@ -21,3 +22,12 @@ repos:
entry: julia --project=. -e 'using JuliaFormatter; format(ARGS, verbose=true)'
language: system
types: [julia]

- id: update-githash
name: Update .githash
description: "Write the current git commit hash to .githash"
entry: bash -c 'git rev-parse HEAD > .githash'
language: system
always_run: true
pass_filenames: false
stages: [post-commit]
77 changes: 69 additions & 8 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@ Modules = [cuNumeric]
Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"]
```

# CUDA.jl Tasking
This section will detail how to use custom CUDA.jl kernels with the Legate runtime. This is still a work in progress

```@autodocs
Modules = [cuNumeric]
Pages = ["cuda.jl"]
```

# CNPreferences

This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md).
Expand All @@ -30,6 +22,75 @@ Modules = [CNPreferences]
Pages = ["CNPreferences.jl"]
```

# CUDA.jl Tasking

Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling.

!!! warning "Experimental Feature"
CUDA.jl tasking is experimental. You must opt in before using `@cuda_task` or `@launch`:
```julia
cuNumeric.Experimental(true)
```

The interface has two steps:
1. **Compile & Register** — [`@cuda_task`](@ref) JIT-compiles a kernel to PTX and registers it with Legate.
2. **Launch** — [`@launch`](@ref) submits the kernel with grid dimensions, inputs, outputs, and scalars.

`NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy.

!!! warning "Inputs vs. outputs"
Correctly separating `inputs` and `outputs` is critical for Legate's
dependency analysis. If an array is both read and written, list it as an `output`.

!!! warning "Array sizes"
Mismatched array sizes are automatically padded to the largest shape. This padding is a temporary workaround; we plan to add support for additional Legate partitioning constraints in the future (more information [here](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__partitioning.html)).

## Example

```julia
using cuNumeric
using CUDA
import CUDA: i32

# Enable experimental features
cuNumeric.Experimental(true)

# 1. Write a standard CUDA.jl kernel
function kernel_sin(a, b, N)
i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
if i <= N
@inbounds b[i] = sin(a[i])
end
return nothing
end

N = 1024
threads = 256
blocks = cld(N, threads)

a = cuNumeric.fill(1.0f0, N)
b = cuNumeric.zeros(Float32, N)

# 2. Compile & register — args are used only for type inference
task = cuNumeric.@cuda_task kernel_sin(a, b, UInt32(1))

# 3. Launch through Legate
cuNumeric.@launch task=task threads=threads blocks=blocks inputs=a outputs=b scalars=UInt32(N)

allowscalar() do
println("sin(1) = ", b[:][1]) # ≈ 0.8414709
end
```

See `examples/custom_cuda.jl` for a more complete example with multiple kernels.

## API Reference

```@autodocs
Modules = [cuNumeric]
Pages = ["utilities/cuda_stubs.jl"]
```

# Internal API

```@autodocs
Expand Down
2 changes: 1 addition & 1 deletion ext/CUDAExt/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using Legate: Legate
using CxxWrap: CxxWrap
using cuNumeric: cuNumeric
import cuNumeric:
@cuda_task, @launch, NDArray
@cuda_task, @launch, NDArray, assert_experimental

const KERNEL_OFFSET = sizeof(CUDA.KernelState)

Expand Down
21 changes: 15 additions & 6 deletions ext/CUDAExt/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ function check_sz(arr, maxshape)
end
end

# Convert an NDArray into a typed Legate.LogicalArray by unwrapping its
# underlying store handle and attaching the array's shape.
function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
    handle = cuNumeric.get_store(arr)
    return Legate.LogicalArray{T,N}(handle[], size(arr))
end

function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads)

Expand All @@ -92,16 +97,16 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
input_vars = Vector{Legate.Variable}()
for arr in inputs
check_sz!(arr, max_shape; copy=true)
store = cuNumeric.get_store(arr)
p = Legate.add_input(task, store)
la = nda_to_logical_array(arr)
p = Legate.add_input(task, la)
push!(input_vars, p)
end

output_vars = Vector{Legate.Variable}()
for arr in outputs
check_sz!(arr, max_shape; copy=false)
store = cuNumeric.get_store(arr)
p = Legate.add_output(task, store)
la = nda_to_logical_array(arr)
p = Legate.add_output(task, la)
push!(output_vars, p)
end

Expand All @@ -115,8 +120,8 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
Legate.add_scalar(task, Legate.Scalar(s)) # 7+ -> ARG_OFFSET
end

# all inputs are alligned with all outputs
Legate.add_default_alignment(task, input_vars, output_vars)
# all inputs are aligned with all outputs
Legate.default_alignment(task, input_vars, output_vars)
Legate.submit_auto_task(rt, task)
end

Expand All @@ -143,6 +148,8 @@ function cuNumeric.ptx_task(ptx::String, kernel_name)
end

macro cuda_task(call_expr)
cuNumeric.assert_experimental()

fname = call_expr.args[1]
fargs = call_expr.args[2:end]

Expand All @@ -164,6 +171,8 @@ macro cuda_task(call_expr)
end

macro launch(args...)
cuNumeric.assert_experimental()

allowed_keys = Set([:task, :blocks, :threads, :inputs, :outputs, :scalars])
kwargs = Dict{Symbol,Any}()

Expand Down
2 changes: 1 addition & 1 deletion lib/cunumeric_jl_wrapper/src/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "types.h"
#include "ufi.h"

#define CUDA_DEBUG 1
#define CUDA_DEBUG 0

#define BLOCK_START 1
#define THREAD_START 4
Expand Down
2 changes: 0 additions & 2 deletions src/scoping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# TODO reduce number of allocations. Potentially remove assigned_vars somehow

export @cunumeric

@doc"""
Expand Down
15 changes: 15 additions & 0 deletions src/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,18 @@ preceding the call to this function.
# Thin pass-through to the Legate runtime's nanosecond timer.
get_time_nanoseconds() = Legate.time_nanoseconds()

"""
    Experimental(setting::Bool)

Enable (`true`) or disable (`false`) experimental cuNumeric features such as
`@cuda_task` and `@launch`. The flag is stored in task-local storage, so it
applies to the current Julia task.
"""
function Experimental(setting::Bool)
    return task_local_storage()[:Experimental] = setting
end

"""
    assert_experimental()

Throw an `ArgumentError` unless experimental features have been opted into
for the current task via `cuNumeric.Experimental(true)`; otherwise return
`nothing`.
"""
function assert_experimental()
    enabled = get(task_local_storage(), :Experimental, false) === true
    enabled || throw(
        ArgumentError(
            "Experimental features are disabled." *
            " Use `cuNumeric.Experimental(true)` to enable them.",
        ),
    )
    return nothing
end
2 changes: 0 additions & 2 deletions src/warnings.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
### THE SCALAR INDEXING LOGIC IS COPIED FROM GPUArrays.jl ###

export allowpromotion, @allowpromotion, assertpromotion, allowscalar, @allowscalar, assertscalar

@enum ImplicitPromotion PromotionAllowed PromotionWarn PromotionWarned PromotionDisallowed
Expand Down
Loading