diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 08e3436c..11dbae28 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: - JuliaCI/julia#v1: version: "{{matrix.julia}}" - jquick/pre-hook#v1.2.0: - command: "julia --project -e 'using Pkg; Pkg.add(;url=\"https://github.com/JuliaLegate/Legate.jl.git\"); Pkg.instantiate()'" + command: "julia --project -e 'using Pkg; Pkg.resolve(); Pkg.instantiate()'" - JuliaCI/julia-test#v1: test_args: "--quickfail" - JuliaCI/julia-coverage#v1: @@ -26,16 +26,18 @@ steps: if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 45 env: + LD_LIBRARY_PATH: "" + LEGATE_AUTO_CONFIG: "0" LEGATE_SHOW_CONFIG: "1" GPUTESTS: "1" LEGATE_TEST: "1" - LEGATE_CONFIG: "--cpus 1 --gpus 1 --utility 1 --fbmem 2000" + LEGATE_CONFIG: "--cpus 1 --gpus 1 --utility 1 --fbmem 500" matrix: setup: julia: - "1.10" - "1.11" - # - "1.12" + - "1.12" arch: - "x64" # adjustments: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3b9ff34..0daab8c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,7 +44,7 @@ jobs: julia: - '1.10' - '1.11' - # - '1.12' + - '1.12' os: - ubuntu-latest # include: diff --git a/Project.toml b/Project.toml index 59bc50ba..ab4c1da2 100644 --- a/Project.toml +++ b/Project.toml @@ -31,14 +31,14 @@ CNPreferences = "0.1.2" CUDA = "5.9" CxxWrap = "0.17" JuliaFormatter = "2.3.0" -Legate = "0.1.0" -LegatePreferences = "0.1.5" +Legate = "0.1.1" +LegatePreferences = "0.1.6" MacroTools = "0.5.16" OpenBLAS32_jll = "0.3" -Pkg = "1.10 - 1.11" +Pkg = "1" Preferences = "1" Random = "1" StatsBase = "0.34" -cunumeric_jl_wrapper_jll = "25.10.2" +cunumeric_jl_wrapper_jll = "25.10.3" cupynumeric_jll = "25.10.2" -julia = "1.10 - 1.11" +julia = "1.10" diff --git a/docs/make.jl b/docs/make.jl index a5bd8f3b..e1a7f0dd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,10 +14,9 @@ makedocs(; ), pages=[ "Home" => "index.md", - "Install Guide" => "install.md", 
"Examples" => "examples.md", "Performance Tips" => "perf.md", - "Back End Details" => "usage.md", + "Custom Installalation" => "install.md", "Benchmarks" => "benchmark.md", "Public API" => "api.md", ], diff --git a/docs/src/api.md b/docs/src/api.md index fbeaa3d5..89c40519 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -8,6 +8,25 @@ Pages = ["api.md"] Depth = 2:2 ``` +### Supported Unary Operations +The following unary operations are supported and can be broadcast over `NDArray`: + + • `-`, `!`, `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atanh`, `cbrt`, `conj`, `cos`, `cosh`, `deg2rad`, `exp`, `exp2`, `expm1`, `floor`, `imag`, `isfinite`, `log`, `log10`, `log1p`, `log2`, `rad2deg`, `real`, `sign`, `signbit`, `sin`, `sinh`, `sqrt`, `tan`, `tanh`, `^2`, `^-1` or `inv`, + +##### Differences +- The `acosh` function in Julia will error on inputs outside of the domain (x >= 1) + but cuNumeric.jl will return NaN. + + + +### Supported Binary Operations +The following binary operations are supported and can be applied elementwise to pairs of `NDArray` values: + + • `+`, `-`, `*`, `/`, `^`, `<`, `<=`, `>`, `>=`, `==`, `!=`, `atan`, `hypot`, `max`, `min`, `lcm`, `gcd` + +These operations are applied elementwise by default and follow standard Julia semantics. 
+ + ```@autodocs Modules = [cuNumeric] Pages = ["ndarray/ndarray.jl", "ndarray/unary.jl", "ndarray/binary.jl", "cuNumeric.jl", "warnings.jl", "util.jl", "memory.jl", "scoping.jl"] diff --git a/docs/src/benchmark.md b/docs/src/benchmark.md index d47fb94e..f4a0b845 100644 --- a/docs/src/benchmark.md +++ b/docs/src/benchmark.md @@ -15,9 +15,9 @@ Code Outline: mul!(C, A, B) ``` -GEMM Efficiency | GEMM GFLOPS -:-------------------------:|:-------------------------: -![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) +| GEMM Efficiency | GEMM GFLOPS | +|---|---| +| ![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) | ## Monte-Carlo Integration @@ -29,9 +29,9 @@ integrand = (x) -> exp.(-x.^2) val = (V/N) * sum(integrand(x)) ``` -MC Efficiency | MC GFLOPS -:-------------------------:|:-------------------------: -![MC Efficiency](images/mc_eff.svg) | ![MC GFLOPS](images/mc_ops.svg) +| MC Efficiency | MC GFLOPS | +|---|---| +| ![MC Efficiency](images/mc_eff.svg) | ![MC GFLOPS](images/mc_ops.svg) | ## Gray-Scott (2D) @@ -147,6 +147,6 @@ To generate a weak scaling plot, you must increment the problem size in proporti As part of a more complete benchmark we ran our code on up to 8 A100 GPUs (single-node) and compared it to the Python library cuPyNumeric as well as a custom implementation using CUDA.jl. From these resutls we can see that cuNumeric.jl is capable of scaling and saturating the GPU memory bandwidth for matrix multiplication. 
-GEMM Efficiency | GEMM GFLOPS -:-------------------------:|:-------------------------: -![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) +| GEMM Efficiency | GEMM GFLOPS | +|---|---| +| ![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) | diff --git a/docs/src/errors.md b/docs/src/errors.md index 41f0059e..61c65e2d 100644 --- a/docs/src/errors.md +++ b/docs/src/errors.md @@ -1,4 +1,4 @@ # Common Errors ## OOM on Startup -If you have other processes using GPU RAM (e.g. another instance of cuNumeric.jl) then cuNumeric.jl will fail to start and will segfault. The first symbol is typically something like `_ZN5Realm4CudaL22allocate_device_memoryEPNS0_3GPUEm`. You can fix this by killing the other jobs or modifying the amount of GPU RAM requested in `LEGATE_CONFIG`. See the [usage](./usage.md) documentation for examples on how to set the `LEGATE_CONFIG` environment variable. +If you have other processes using GPU RAM (e.g. another instance of cuNumeric.jl) then cuNumeric.jl will fail to start and will segfault. The first symbol is typically something like `_ZN5Realm4CudaL22allocate_device_memoryEPNS0_3GPUEm`. You can fix this by killing the other jobs or modifying the amount of GPU RAM requested in `LEGATE_CONFIG`. See the [performance](./perf.md) documentation for examples on how to set the `LEGATE_CONFIG` environment variable. diff --git a/docs/src/perf.md b/docs/src/perf.md index 71411858..0876b9e8 100644 --- a/docs/src/perf.md +++ b/docs/src/perf.md @@ -6,5 +6,18 @@ Accessing elements of an NDArray one at a time (e.g., `arr[5]`) is slow and shou ## Avoid Implicit Promotion Mixing integral types of different size (e.g., `Float64` and `Float32`) will result in implicit promotion of the smaller type to the larger types. This creates a copy of the data and hurts performance. 
Implicit promotion from a smaller integral type to a larger integral type will emit an error which can be opted out of with `@allowpromotion` or `allowpromotion() do ... end`. This error is common when mixing literals with `NDArrays`. By default a floating point literal (i.e., 1.0) is `Float64` but the default type of an `NDArray` is `Float32`. +## Setting Hardware Configuration + +There is no programmatic way to set the hardware configuration used by CuPyNumeric (as of 26.01). By default, the hardware configuration is set automatically by Legate. This configuration can be manipulated through the following environment variables: + +- `LEGATE_SHOW_CONFIG` : When set to 1, the Legate config is printed to stdout +- `LEGATE_AUTO_CONFIG`: When set to 1, Legate will automatically choose the hardware configuration +- `LEGATE_CONFIG`: A string representing the hardware configuration to set + +These variables must be set before launching the Julia instance running cuNumeric.jl. We recommend setting `export LEGATE_SHOW_CONFIG=1` so that the hardware configuration will be printed when Legate starts. This output is automatically captured and relayed to the user. + +To manually set the hardware configuration, `export LEGATE_AUTO_CONFIG=0`, and then define your own config with something like `export LEGATE_CONFIG="--gpus 1 --cpus 10 --ompthreads 10"`. We recommend using the default memory configuration for your machine and only setting the `gpus`, `cpus` and `ompthreads`. More details about the Legate configuration can be found in the [NVIDIA Legate documentation](https://docs.nvidia.com/legate/latest/usage.html#resource-allocation). If you know where Legate is installed on your computer you can also run `legate --help` for more detailed information. + + ## Kernel Fusion cuPyNumeric does not fuse independent operations automatically, even in broadcast expressions. This is a priority for a future release. 
diff --git a/docs/src/usage.md b/docs/src/usage.md deleted file mode 100644 index 464e2f80..00000000 --- a/docs/src/usage.md +++ /dev/null @@ -1,12 +0,0 @@ - -## Setting Hardware Configuration - -There is no programatic way to set the hardware configuration used by CuPyNumeric (as of 26.01). By default, the hardware configuration is set automatically by Legate. This configuration can be manipulated through the following environment variables: - -- `LEGATE_SHOW_CONFIG` : When set to 1, the Legate config is printed to stdout -- `LEGATE_AUTO_CONFIG`: When set to 1, Legate will automatically choose the hardware configuration -- `LEGATE_CONFIG`: A string representing the hardware configuration to set - -These variables must be set before launching the Julia instance running cuNumeric.jl. We recommend setting `export LEGATE_SHOW_CONFIG=1` so that the hardware configuration will be printed when Legate starts. This output is automatically captured and relayed to the user. - -To manually set the hardware configuration, `export LEGATE_AUTO_CONFIG=0`, and then define your own config with something like `export LEGATE_CONFIG="--gpus 1 --cpus 10 --ompthreads 10"`. We recommend using the default memory configuration for your machine and only settings the `gpus`, `cpus` and `ompthreads`. More details about the Legate configuration can be found in the [NVIDIA Legate documentation](https://docs.nvidia.com/legate/latest/usage.html#resource-allocation). If you know where Legate is installed on your computer you can also run `legate --help` for more detailed information. 
diff --git a/src/ndarray/binary.jl b/src/ndarray/binary.jl index 847ba6cf..c04b515b 100644 --- a/src/ndarray/binary.jl +++ b/src/ndarray/binary.jl @@ -1,44 +1,3 @@ -@doc""" -Supported Binary Operations -=========================== - -The following binary operations are supported and can be applied elementwise to pairs of `NDArray` values: - - • `+` - • `-` - • `*` - • `/` - • `^` - • `<` - • `<=` - • `>` - • `>=` - • `==` - • `!=` - • `atan` - • `hypot` - • `max` - • `min` - • `lcm` - • `gcd` - -These operations are applied elementwise by default and follow standard Julia semantics. - -Examples --------- - -```julia -A = NDArray(randn(Float64, 4)) -B = NDArray(randn(Float64, 4)) - -A + B -A / B -hypot.(A, B) -div.(A, B) -A .^ 2 -``` -""" - # Still missing: # # Base.copysign => cuNumeric.COPYSIGN, #* ANNOYING TO TEST # #missing => cuNumeric.fmod, #same as mod in Julia? diff --git a/src/ndarray/unary.jl b/src/ndarray/unary.jl index 504a9cf2..6cc7b749 100644 --- a/src/ndarray/unary.jl +++ b/src/ndarray/unary.jl @@ -1,60 +1,3 @@ -export square - -@doc""" -Supported Unary Operations -=========================== - -The following unary operations are supported and can be broadcast over `NDArray`: - - - `-` (negation) - - `!` (logical not) - - `abs` - - `acos` - - `acosh` - - `asin` - - `asinh` - - `atan` - - `atanh` - - `cbrt` - - `cos` - - `cosh` - - `deg2rad` - - `exp` - - `exp2` - - `expm1` - - `floor` - - `isfinite` - - `log` - - `log10` - - `log1p` - - `log2` - - `rad2deg` - - `sign` - - `signbit` - - `sin` - - `sinh` - - `sqrt` - - `tan` - - `tanh` - - `^2` - - `^-1` or `inv` - -Differences ------------ -- The `acosh` function in Julia will error on inputs outside of the domain (x >= 1) - but cuNumeric.jl will return NaN. 
- -Examples --------- - -```julia -A = cuNumeric.ones(Float32, 3, 3) - -abs.(A) -log.(A .+ 1) --sqrt.(abs.(A)) -``` -""" global const floaty_unary_ops_no_args = Dict{Function,UnaryOpCode}( Base.acos => cuNumeric.ARCCOS, Base.acosh => cuNumeric.ARCCOSH,