Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .githash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
224cac24f15449a338abebe1bc17ba74c07d9f5c
17adad8a6f9dfd2a79e3a9297020abeedfe5a10f
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
default_stages: [pre-commit]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
Expand All @@ -21,3 +22,12 @@ repos:
entry: julia --project=. -e 'using JuliaFormatter; format(ARGS, verbose=true)'
language: system
types: [julia]

- id: update-githash
name: Update .githash
description: "Write the current git commit hash to .githash"
entry: bash -c 'git rev-parse HEAD > .githash'
language: system
always_run: true
pass_filenames: false
stages: [post-commit]
77 changes: 69 additions & 8 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@ Modules = [cuNumeric]
Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"]
```

# CUDA.jl Tasking
This section will detail how to use custom CUDA.jl kernels with the Legate runtime. This is still a work in progress

```@autodocs
Modules = [cuNumeric]
Pages = ["cuda.jl"]
```

# CNPreferences

This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md).
Expand All @@ -30,6 +22,75 @@ Modules = [CNPreferences]
Pages = ["CNPreferences.jl"]
```

# CUDA.jl Tasking

Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling.

!!! warning "Experimental Feature"
CUDA.jl tasking is experimental. You must opt in before using `@cuda_task` or `@launch`:
```julia
cuNumeric.Experimental(true)
```

The interface has two steps:
1. **Compile & Register** — [`@cuda_task`](@ref) JIT-compiles a kernel to PTX and registers it with Legate.
2. **Launch** — [`@launch`](@ref) submits the kernel with grid dimensions, inputs, outputs, and scalars.

`NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy.

!!! warning "Inputs vs. outputs"
Correctly separating `inputs` and `outputs` is critical for Legate's
dependency analysis. If an array is both read and written, list it as an `output`.

!!! warning "Array sizes"
Mismatched array sizes are automatically padded to the largest shape. This padding is a temporary workaround; we plan to add support for additional Legate partitioning constraints in the future (more information [here](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__partitioning.html)).

## Example

```julia
using cuNumeric
using CUDA
import CUDA: i32

# Enable experimental features
cuNumeric.Experimental(true)

# 1. Write a standard CUDA.jl kernel
function kernel_sin(a, b, N)
i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
if i <= N
@inbounds b[i] = sin(a[i])
end
return nothing
end

N = 1024
threads = 256
blocks = cld(N, threads)

a = cuNumeric.fill(1.0f0, N)
b = cuNumeric.zeros(Float32, N)

# 2. Compile & register — args are used only for type inference
task = cuNumeric.@cuda_task kernel_sin(a, b, UInt32(1))

# 3. Launch through Legate
cuNumeric.@launch task=task threads=threads blocks=blocks inputs=a outputs=b scalars=UInt32(N)

allowscalar() do
println("sin(1) = ", b[:][1]) # ≈ 0.8414709
end
```

See `examples/custom_cuda.jl` for a more complete example with multiple kernels.

## API Reference

```@autodocs
Modules = [cuNumeric]
Pages = ["utilities/cuda_stubs.jl"]
```

# Internal API

```@autodocs
Expand Down
2 changes: 1 addition & 1 deletion ext/CUDAExt/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using Legate: Legate
using CxxWrap: CxxWrap
using cuNumeric: cuNumeric
import cuNumeric:
@cuda_task, @launch, NDArray
@cuda_task, @launch, NDArray, assert_experimental

const KERNEL_OFFSET = sizeof(CUDA.KernelState)

Expand Down
21 changes: 15 additions & 6 deletions ext/CUDAExt/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ function check_sz(arr, maxshape)
end
end

# Convert an NDArray into a typed Legate.LogicalArray by unwrapping its
# underlying store handle and attaching the array's shape.
function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
    handle = cuNumeric.get_store(arr)
    return Legate.LogicalArray{T,N}(handle[], size(arr))
end

function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads)

Expand All @@ -92,16 +97,16 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
input_vars = Vector{Legate.Variable}()
for arr in inputs
check_sz!(arr, max_shape; copy=true)
store = cuNumeric.get_store(arr)
p = Legate.add_input(task, store)
la = nda_to_logical_array(arr)
p = Legate.add_input(task, la)
push!(input_vars, p)
end

output_vars = Vector{Legate.Variable}()
for arr in outputs
check_sz!(arr, max_shape; copy=false)
store = cuNumeric.get_store(arr)
p = Legate.add_output(task, store)
la = nda_to_logical_array(arr)
p = Legate.add_output(task, la)
push!(output_vars, p)
end

Expand All @@ -115,8 +120,8 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
Legate.add_scalar(task, Legate.Scalar(s)) # 7+ -> ARG_OFFSET
end

# all inputs are alligned with all outputs
Legate.add_default_alignment(task, input_vars, output_vars)
# all inputs are aligned with all outputs
Legate.default_alignment(task, input_vars, output_vars)
Legate.submit_auto_task(rt, task)
end

Expand All @@ -143,6 +148,8 @@ function cuNumeric.ptx_task(ptx::String, kernel_name)
end

macro cuda_task(call_expr)
cuNumeric.assert_experimental()

fname = call_expr.args[1]
fargs = call_expr.args[2:end]

Expand All @@ -164,6 +171,8 @@ macro cuda_task(call_expr)
end

macro launch(args...)
cuNumeric.assert_experimental()

allowed_keys = Set([:task, :blocks, :threads, :inputs, :outputs, :scalars])
kwargs = Dict{Symbol,Any}()

Expand Down
2 changes: 1 addition & 1 deletion lib/cunumeric_jl_wrapper/src/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "types.h"
#include "ufi.h"

#define CUDA_DEBUG 1
#define CUDA_DEBUG 0

#define BLOCK_START 1
#define THREAD_START 4
Expand Down
2 changes: 0 additions & 2 deletions src/scoping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# TODO reduce number of allocations. Potentially remove assigned_vars somehow

export @cunumeric

@doc"""
Expand Down
15 changes: 15 additions & 0 deletions src/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,18 @@ preceding the call to this function.
# Thin pass-through to the Legate runtime's nanosecond timer.
get_time_nanoseconds() = Legate.time_nanoseconds()

"""
    Experimental(setting::Bool)

Enable (`true`) or disable (`false`) experimental cuNumeric features such as
`@cuda_task` and `@launch`. The flag is stored in task-local storage, so it
applies to the current Julia task.
"""
function Experimental(setting::Bool)
    return task_local_storage()[:Experimental] = setting
end

"""
    assert_experimental()

Throw an `ArgumentError` unless experimental features have been opted into
for the current task via `cuNumeric.Experimental(true)`; otherwise return
`nothing`.
"""
function assert_experimental()
    enabled = get(task_local_storage(), :Experimental, false) === true
    enabled || throw(
        ArgumentError(
            "Experimental features are disabled." *
            " Use `cuNumeric.Experimental(true)` to enable them.",
        ),
    )
    return nothing
end
2 changes: 0 additions & 2 deletions src/warnings.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
### THE SCALAR INDEXING LOGIC IS COPIED FROM GPUArrays.jl ###

export allowpromotion, @allowpromotion, assertpromotion, allowscalar, @allowscalar, assertscalar

@enum ImplicitPromotion PromotionAllowed PromotionWarn PromotionWarned PromotionDisallowed
Expand Down
Loading