From 1c73800bf897f5cb13e085a43340958621554a3d Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 8 Apr 2026 17:08:37 -0500
Subject: [PATCH 1/4] cuda jl tasking documentation updates

---
 .githash                |  2 +-
 .pre-commit-config.yaml | 10 +++++
 docs/src/api.md         | 81 ++++++++++++++++++++++++++++++++++++++++-
 ext/CUDAExt/CUDAExt.jl  |  2 +-
 ext/CUDAExt/cuda.jl     |  4 ++
 src/scoping.jl          |  2 -
 src/util.jl             | 15 ++++++++
 src/warnings.jl         |  2 -
 8 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/.githash b/.githash
index 87bef491..e1142e7d 100644
--- a/.githash
+++ b/.githash
@@ -1 +1 @@
-224cac24f15449a338abebe1bc17ba74c07d9f5c
+0c87a32be84ecaf7346c8f4f028eae208a5a7cda
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7a201053..37b95947 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+default_stages: [pre-commit]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.6.0
@@ -21,3 +22,12 @@ repos:
         entry: julia --project=. -e 'using JuliaFormatter; format(ARGS, verbose=true)'
         language: system
         types: [julia]
+
+      - id: update-githash
+        name: Update .githash
+        description: "Write the current git commit hash to .githash"
+        entry: bash -c 'git rev-parse HEAD > .githash'
+        language: system
+        always_run: true
+        pass_filenames: false
+        stages: [post-commit]
diff --git a/docs/src/api.md b/docs/src/api.md
index fbeaa3d5..0ab680b7 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -14,7 +14,86 @@ Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumer
 ```
 
 # CUDA.jl Tasking
-This section will detail how to use custom CUDA.jl kernels with the Legate runtime. This is still a work in progress
+
+Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling.
+
+!!! warning "Experimental Feature"
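+    CUDA.jl tasking is experimental. The opt-in flag is stored in task-local storage and is checked when `@cuda_task` and `@launch` are expanded, so enable it in the same task, before any code that uses these macros is evaluated: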
+    ```julia
+    cuNumeric.Experimental(true)
+    ```
+
+The interface has two steps:
+1. **Compile & Register** — [`@cuda_task`](@ref) JIT-compiles a kernel to PTX and registers it with Legate.
+2. **Launch** — [`@launch`](@ref) submits the kernel with grid dimensions, inputs, outputs, and scalars.
+
+`NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy.
+
+!!! note "Argument ordering"
+    Legate passes kernel arguments in the order: **inputs → outputs → scalars**.
+    Your kernel signature must match this ordering.
+
+!!! warning "Inputs vs. outputs"
+    Correctly separating `inputs` and `outputs` is critical for Legate's
+    dependency analysis. If an array is both read and written, list it as an `output`.
+
+## Example
+
+```julia
+using cuNumeric
+using CUDA
+import CUDA: i32
+
+# Enable experimental features
+cuNumeric.Experimental(true)
+
+# 1. Write a standard CUDA.jl kernel
+function kernel_sin(a, b, N)
+    i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
+    if i <= N
+        @inbounds b[i] = sin(a[i])
+    end
+    return nothing
+end
+
+N = 1024
+threads = 256
+blocks = cld(N, threads)
+
+a = cuNumeric.fill(1.0f0, N)
+b = cuNumeric.zeros(Float32, N)
+
+# 2. Compile & register — args are used only for type inference
+task = cuNumeric.@cuda_task kernel_sin(a, b, UInt32(1))
+
+# 3. Launch through Legate
+cuNumeric.@launch task=task threads=threads blocks=blocks inputs=a outputs=b scalars=UInt32(N)
+
+allowscalar() do
+    println("sin(1) = ", b[:][1])  # ≈ 0.8414709
+end
+```
+
+See `examples/custom_cuda.jl` for a more complete example with multiple kernels.
+
+## `@launch` Keywords
+
+| Keyword   | Type                 | Default  | Description                    |
+|-----------|----------------------|----------|--------------------------------|
+| `task`    | `CUDATask`           | required | Compiled kernel handle         |
+| `blocks`  | `Int` or `Tuple`     | `(1,)`   | CUDA grid dimensions           |
+| `threads` | `Int` or `Tuple`     | `(256,)` | CUDA block dimensions          |
+| `inputs`  | `NDArray` or `Tuple` | `()`     | Read-only input arrays         |
+| `outputs` | `NDArray` or `Tuple` | `()`     | Read-write output arrays       |
+| `scalars` | scalar or `Tuple`    | `()`     | Scalar kernel arguments        |
+
+## Limitations
+
+- Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly.
+- Mismatched array sizes are automatically padded to the largest shape.
+- Custom function broadcasting is not supported; write explicit index-based kernels.
+
+## API Reference
 
 ```@autodocs
 Modules = [cuNumeric]
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
index e991aac6..ed9c07c6 100644
--- a/ext/CUDAExt/CUDAExt.jl
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -6,7 +6,7 @@ using Legate: Legate
 using CxxWrap: CxxWrap
 using cuNumeric: cuNumeric
 import cuNumeric:
-    @cuda_task, @launch, NDArray
+    @cuda_task, @launch, NDArray, assert_experimental
 
 const KERNEL_OFFSET = sizeof(CUDA.KernelState)
 
diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl
index a70a5b90..fcb7a003 100644
--- a/ext/CUDAExt/cuda.jl
+++ b/ext/CUDAExt/cuda.jl
@@ -143,6 +143,8 @@ function cuNumeric.ptx_task(ptx::String, kernel_name)
 end
 
 macro cuda_task(call_expr)
+    cuNumeric.assert_experimental()
+
     fname = call_expr.args[1]
     fargs = call_expr.args[2:end]
 
@@ -164,6 +166,8 @@ macro cuda_task(call_expr)
 end
 
 macro launch(args...)
+    cuNumeric.assert_experimental()
+
     allowed_keys = Set([:task, :blocks, :threads, :inputs, :outputs, :scalars])
     kwargs = Dict{Symbol,Any}()
 
diff --git a/src/scoping.jl b/src/scoping.jl
index ecac81ca..6983c455 100644
--- a/src/scoping.jl
+++ b/src/scoping.jl
@@ -1,5 +1,3 @@
-# TODO reduce number of allocations. Potentially remove assigned_vars somehow
-
 export @cunumeric
 
 @doc"""
diff --git a/src/util.jl b/src/util.jl
index a47f3ad5..96856ab7 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -15,3 +15,18 @@ preceding the call to this function.
 function get_time_nanoseconds()
     return Legate.time_nanoseconds()
 end
+
+function Experimental(setting::Bool)
+    task_local_storage(:Experimental, setting)
+end
+
+function assert_experimental()
+    if get(task_local_storage(), :Experimental, false) !== true
+        throw(
+            ArgumentError(
+                "Experimental features are disabled." *
+                " Use `cuNumeric.Experimental(true)` to enable them.",
+            ),
+        )
+    end
+end
diff --git a/src/warnings.jl b/src/warnings.jl
index c993571e..db0ae37f 100644
--- a/src/warnings.jl
+++ b/src/warnings.jl
@@ -1,5 +1,3 @@
-### THE SCALAR INDEXING LOGIC IS COPIED FROM GPUArrays.jl ###
-
 export allowpromotion, @allowpromotion, assertpromotion, allowscalar, @allowscalar, assertscalar
 
 @enum ImplicitPromotion PromotionAllowed PromotionWarn PromotionWarned PromotionDisallowed

From 54c8286aed55dc3987e726d5defc67dbb09ff41f Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 8 Apr 2026 17:35:44 -0500
Subject: [PATCH 2/4] fix cuda.jl to support claims made in documentation

---
 .githash                              |  2 +-
 docs/src/api.md                       |  2 +-
 ext/CUDAExt/cuda.jl                   | 17 +++++++++++------
 lib/cunumeric_jl_wrapper/src/cuda.cpp |  2 +-
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.githash b/.githash
index e1142e7d..eb6896ec 100644
--- a/.githash
+++ b/.githash
@@ -1 +1 @@
-0c87a32be84ecaf7346c8f4f028eae208a5a7cda
+1c73800bf897f5cb13e085a43340958621554a3d
diff --git a/docs/src/api.md b/docs/src/api.md
index 0ab680b7..51d5879f 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -97,7 +97,7 @@ See `examples/custom_cuda.jl` for a more complete example with multiple kernels.
 
 ```@autodocs
 Modules = [cuNumeric]
-Pages = ["cuda.jl"]
+Pages = ["utilities/cuda_stubs.jl"]
 ```
 
 # CNPreferences
diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl
index fcb7a003..05d7e9f1 100644
--- a/ext/CUDAExt/cuda.jl
+++ b/ext/CUDAExt/cuda.jl
@@ -74,6 +74,11 @@ function check_sz(arr, maxshape)
     end
 end
 
+function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
+    st_handle = cuNumeric.get_store(arr)
+    return Legate.LogicalArray{T,N}(st_handle[], size(arr))
+end
+
 function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
     outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads)
 
@@ -92,16 +97,16 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
     input_vars = Vector{Legate.Variable}()
     for arr in inputs
         check_sz!(arr, max_shape; copy=true)
-        store = cuNumeric.get_store(arr)
-        p = Legate.add_input(task, store)
+        la = nda_to_logical_array(arr)
+        p = Legate.add_input(task, la)
         push!(input_vars, p)
     end
 
     output_vars = Vector{Legate.Variable}()
     for arr in outputs
         check_sz!(arr, max_shape; copy=false)
-        store = cuNumeric.get_store(arr)
-        p = Legate.add_output(task, store)
+        la = nda_to_logical_array(arr)
+        p = Legate.add_output(task, la)
         push!(output_vars, p)
     end
 
@@ -115,8 +120,8 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
         Legate.add_scalar(task, Legate.Scalar(s)) # 7+ -> ARG_OFFSET
     end
 
-    # all inputs are alligned with all outputs
-    Legate.add_default_alignment(task, input_vars, output_vars)
+    # all inputs are aligned with all outputs
+    Legate.default_alignment(task, input_vars, output_vars)
     Legate.submit_auto_task(rt, task)
 end
 
diff --git a/lib/cunumeric_jl_wrapper/src/cuda.cpp b/lib/cunumeric_jl_wrapper/src/cuda.cpp
index 4d5256bf..d5e779d0 100644
--- a/lib/cunumeric_jl_wrapper/src/cuda.cpp
+++ b/lib/cunumeric_jl_wrapper/src/cuda.cpp
@@ -29,7 +29,7 @@
 #include "types.h"
 #include "ufi.h"
 
-#define CUDA_DEBUG 1
+#define CUDA_DEBUG 0
 
 #define BLOCK_START 1
 #define THREAD_START 4

From 17adad8a6f9dfd2a79e3a9297020abeedfe5a10f Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 8 Apr 2026 17:45:40 -0500
Subject: [PATCH 3/4] change ordering of documentation

---
 .githash        |  2 +-
 docs/src/api.md | 33 +++++++++------------------------
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/.githash b/.githash
index eb6896ec..505cc7e0 100644
--- a/.githash
+++ b/.githash
@@ -1 +1 @@
-1c73800bf897f5cb13e085a43340958621554a3d
+54c8286aed55dc3987e726d5defc67dbb09ff41f
diff --git a/docs/src/api.md b/docs/src/api.md
index 51d5879f..155da1ac 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -13,6 +13,15 @@ Modules = [cuNumeric]
 Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"]
 ```
 
+# CNPreferences
+
+This section details how to set custom build configuration options. For more details, see our [install guide](./install.md).
+
+```@autodocs
+Modules = [CNPreferences]
+Pages = ["CNPreferences.jl"]
+```
+
 # CUDA.jl Tasking
 
 Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling.
@@ -29,10 +38,6 @@ The interface has two steps:
 
 `NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy.
 
-!!! note "Argument ordering"
-    Legate passes kernel arguments in the order: **inputs → outputs → scalars**.
-    Your kernel signature must match this ordering.
-
 !!! warning "Inputs vs. outputs"
     Correctly separating `inputs` and `outputs` is critical for Legate's
     dependency analysis. If an array is both read and written, list it as an `output`.
@@ -76,17 +81,6 @@ end
 
 See `examples/custom_cuda.jl` for a more complete example with multiple kernels.
 
-## `@launch` Keywords
-
-| Keyword   | Type                 | Default  | Description                    |
-|-----------|----------------------|----------|--------------------------------|
-| `task`    | `CUDATask`           | required | Compiled kernel handle         |
-| `blocks`  | `Int` or `Tuple`     | `(1,)`   | CUDA grid dimensions           |
-| `threads` | `Int` or `Tuple`     | `(256,)` | CUDA block dimensions          |
-| `inputs`  | `NDArray` or `Tuple` | `()`     | Read-only input arrays         |
-| `outputs` | `NDArray` or `Tuple` | `()`     | Read-write output arrays       |
-| `scalars` | scalar or `Tuple`    | `()`     | Scalar kernel arguments        |
-
 ## Limitations
 
 - Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly.
@@ -100,15 +94,6 @@ Modules = [cuNumeric]
 Pages = ["utilities/cuda_stubs.jl"]
 ```
 
-# CNPreferences
-
-This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md).
-
-```@autodocs
-Modules = [CNPreferences]
-Pages = ["CNPreferences.jl"]
-```
-
 # Internal API
 
 ```@autodocs

From a54b006495ac58dd42c7b85cbbb38d00af89775a Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 8 Apr 2026 17:59:16 -0500
Subject: [PATCH 4/4] even more simple

---
 .githash        | 2 +-
 docs/src/api.md | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/.githash b/.githash
index 505cc7e0..6dd09371 100644
--- a/.githash
+++ b/.githash
@@ -1 +1 @@
-54c8286aed55dc3987e726d5defc67dbb09ff41f
+17adad8a6f9dfd2a79e3a9297020abeedfe5a10f
diff --git a/docs/src/api.md b/docs/src/api.md
index 155da1ac..91ae5d5a 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -42,6 +42,9 @@ The interface has two steps:
     Correctly separating `inputs` and `outputs` is critical for Legate's
     dependency analysis. If an array is both read and written, list it as an `output`.
 
+!!! warning "Array sizes"
+    Arrays whose sizes do not match are automatically padded to the largest shape among the launch arguments, because all inputs are currently aligned with all outputs. We plan to lift this restriction by supporting other Legate partitioning constraints in the future (see the [Legate partitioning documentation](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__partitioning.html)).
+
 ## Example
 
 ```julia
@@ -81,12 +84,6 @@ end
 
 See `examples/custom_cuda.jl` for a more complete example with multiple kernels.
 
-## Limitations
-
-- Only `NDArray` objects are supported — raw `CuArray` cannot be passed directly.
-- Mismatched array sizes are automatically padded to the largest shape.
-- Custom function broadcasting is not supported; write explicit index-based kernels.
-
 ## API Reference
 
 ```@autodocs