diff --git a/.githash b/.githash index 87bef491..6dd09371 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -224cac24f15449a338abebe1bc17ba74c07d9f5c +17adad8a6f9dfd2a79e3a9297020abeedfe5a10f diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a201053..37b95947 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,4 @@ +default_stages: [pre-commit] repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 @@ -21,3 +22,12 @@ repos: entry: julia --project=. -e 'using JuliaFormatter; format(ARGS, verbose=true)' language: system types: [julia] + + - id: update-githash + name: Update .githash + description: "Write the current git commit hash to .githash" + entry: bash -c 'git rev-parse HEAD > .githash' + language: system + always_run: true + pass_filenames: false + stages: [post-commit] diff --git a/docs/src/api.md b/docs/src/api.md index fbeaa3d5..91ae5d5a 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -13,14 +13,6 @@ Modules = [cuNumeric] Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"] ``` -# CUDA.jl Tasking -This section will detail how to use custom CUDA.jl kernels with the Legate runtime. This is still a work in progress - -```@autodocs -Modules = [cuNumeric] -Pages = ["cuda.jl"] -``` - # CNPreferences This section details how to set custom build configuration options. To see more details visit our install guide [here](./install.md). @@ -30,6 +22,75 @@ Modules = [CNPreferences] Pages = ["CNPreferences.jl"] ``` +# CUDA.jl Tasking + +Write custom GPU kernels in Julia using CUDA.jl and execute them through the Legate distributed runtime. Your kernels automatically benefit from Legate's data partitioning, dependency tracking, and multi-GPU scheduling. + +!!! warning "Experimental Feature" + CUDA.jl tasking is experimental. You must opt in before using `@cuda_task` or `@launch`: + ```julia + cuNumeric.Experimental(true) + ``` + +The interface has two steps: +1. **Compile & Register** — [`@cuda_task`](@ref) JIT-compiles a kernel to PTX and registers it with Legate. +2. **Launch** — [`@launch`](@ref) submits the kernel with grid dimensions, inputs, outputs, and scalars. + +`NDArray` arguments are automatically mapped to their CUDA equivalents (`NDArray{T,1}` → `CuDeviceVector{T,1}`, etc.). Scalar arguments are passed through by copy. + +!!! warning "Inputs vs. outputs" + Correctly separating `inputs` and `outputs` is critical for Legate's + dependency analysis. If an array is both read and written, list it as an `output`. + +!!! warning "Array sizes" + Mismatched array sizes are automatically padded to the largest shape. To address this, we plan to add support for other Legate constraints in the future (more information [here](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__partitioning.html)). + +## Example + +```julia +using cuNumeric +using CUDA +import CUDA: i32 + +# Enable experimental features +cuNumeric.Experimental(true) + +# 1. Write a standard CUDA.jl kernel +function kernel_sin(a, b, N) + i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x + if i <= N + @inbounds b[i] = sin(a[i]) + end + return nothing +end + +N = 1024 +threads = 256 +blocks = cld(N, threads) + +a = cuNumeric.fill(1.0f0, N) +b = cuNumeric.zeros(Float32, N) + +# 2. Compile & register — args are used only for type inference +task = cuNumeric.@cuda_task kernel_sin(a, b, UInt32(1)) + +# 3. Launch through Legate +cuNumeric.@launch task=task threads=threads blocks=blocks inputs=a outputs=b scalars=UInt32(N) + +allowscalar() do + println("sin(1) = ", b[:][1]) # ≈ 0.8414709 +end +``` + +See `examples/custom_cuda.jl` for a more complete example with multiple kernels. + +## API Reference + +```@autodocs +Modules = [cuNumeric] +Pages = ["utilities/cuda_stubs.jl"] +``` + # Internal API ```@autodocs diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index e991aac6..ed9c07c6 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -6,7 +6,7 @@ using Legate: Legate using CxxWrap: CxxWrap using cuNumeric: cuNumeric import cuNumeric: - @cuda_task, @launch, NDArray + @cuda_task, @launch, NDArray, assert_experimental const KERNEL_OFFSET = sizeof(CUDA.KernelState) diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl index a70a5b90..05d7e9f1 100644 --- a/ext/CUDAExt/cuda.jl +++ b/ext/CUDAExt/cuda.jl @@ -74,6 +74,11 @@ function check_sz(arr, maxshape) end end +function nda_to_logical_array(arr::NDArray{T,N}) where {T,N} + st_handle = cuNumeric.get_store(arr) + return Legate.LogicalArray{T,N}(st_handle[], size(arr)) +end + function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads) @@ -92,16 +97,16 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, input_vars = Vector{Legate.Variable}() for arr in inputs check_sz!(arr, max_shape; copy=true) - store = cuNumeric.get_store(arr) - p = Legate.add_input(task, store) + la = nda_to_logical_array(arr) + p = Legate.add_input(task, la) push!(input_vars, p) end output_vars = Vector{Legate.Variable}() for arr in outputs check_sz!(arr, max_shape; copy=false) - store = cuNumeric.get_store(arr) - p = Legate.add_output(task, store) + la = nda_to_logical_array(arr) + p = Legate.add_output(task, la) push!(output_vars, p) end @@ -115,8 +120,8 @@ function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, Legate.add_scalar(task, Legate.Scalar(s)) # 7+ -> ARG_OFFSET end - # all inputs are alligned with all outputs - Legate.add_default_alignment(task, input_vars, output_vars) + # all inputs are aligned with all outputs + Legate.default_alignment(task, input_vars, output_vars) Legate.submit_auto_task(rt, task) end @@ -143,6 +148,8 @@ function cuNumeric.ptx_task(ptx::String, kernel_name) end macro cuda_task(call_expr) + cuNumeric.assert_experimental() + fname = call_expr.args[1] fargs = call_expr.args[2:end] @@ -164,6 +171,8 @@ macro cuda_task(call_expr) end macro launch(args...) + cuNumeric.assert_experimental() + allowed_keys = Set([:task, :blocks, :threads, :inputs, :outputs, :scalars]) kwargs = Dict{Symbol,Any}() diff --git a/lib/cunumeric_jl_wrapper/src/cuda.cpp b/lib/cunumeric_jl_wrapper/src/cuda.cpp index 4d5256bf..d5e779d0 100644 --- a/lib/cunumeric_jl_wrapper/src/cuda.cpp +++ b/lib/cunumeric_jl_wrapper/src/cuda.cpp @@ -29,7 +29,7 @@ #include "types.h" #include "ufi.h" -#define CUDA_DEBUG 1 +#define CUDA_DEBUG 0 #define BLOCK_START 1 #define THREAD_START 4 diff --git a/src/scoping.jl b/src/scoping.jl index ecac81ca..6983c455 100644 --- a/src/scoping.jl +++ b/src/scoping.jl @@ -1,5 +1,3 @@ -# TODO reduce number of allocations. Potentially remove assigned_vars somehow - export @cunumeric @doc""" diff --git a/src/util.jl b/src/util.jl index a47f3ad5..96856ab7 100644 --- a/src/util.jl +++ b/src/util.jl @@ -15,3 +15,18 @@ preceding the call to this function. function get_time_nanoseconds() return Legate.time_nanoseconds() end + +function Experimental(setting::Bool) + task_local_storage(:Experimental, setting) +end + +function assert_experimental() + if get(task_local_storage(), :Experimental, false) !== true + throw( + ArgumentError( + "Experimental features are disabled." * + " Use `cuNumeric.Experimental(true)` to enable them.", + ), + ) + end +end diff --git a/src/warnings.jl b/src/warnings.jl index c993571e..db0ae37f 100644 --- a/src/warnings.jl +++ b/src/warnings.jl @@ -1,5 +1,3 @@ -### THE SCALAR INDEXING LOGIC IS COPIED FROM GPUArrays.jl ### - export allowpromotion, @allowpromotion, assertpromotion, allowscalar, @allowscalar, assertscalar @enum ImplicitPromotion PromotionAllowed PromotionWarn PromotionWarned PromotionDisallowed