diff --git a/.githash b/.githash index 6dd09371..cee7a114 100644 --- a/.githash +++ b/.githash @@ -1 +1 @@ -17adad8a6f9dfd2a79e3a9297020abeedfe5a10f +ff81537a0c8e23806869eef5c28c235b0dc3fbbe diff --git a/README.md b/README.md index 543d857a..88703e7c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ cuNumeric.versioninfo() ``` > [!WARNING] -> Starting more than one instance of cuNumeric.jl can lead to a hard-crash. The default hardware configuration reserves all available resources. See the [hardware configuration](https://julialegate.github.io/cuNumeric.jl/dev/perf#Setting-Hardware-Configuration) documentation to learn more. +> Starting more than one instance of cuNumeric.jl can lead to a hard-crash. The default hardware configuration reserves all available resources. For more details, please visit our hardware configuration documentation. ### Monte-Carlo Example ```julia diff --git a/docs/src/benchmark.md b/docs/src/benchmark.md index f4a0b845..ac082766 100644 --- a/docs/src/benchmark.md +++ b/docs/src/benchmark.md @@ -15,9 +15,18 @@ Code Outline: mul!(C, A, B) ``` -| GEMM Efficiency | GEMM GFLOPS | -|---|---| -| ![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) | +```@raw html + + + + + + + + + +
GEMM EfficiencyGEMM GFLOPS
GEMM EfficiencyGEMM GFLOPS
+``` ## Monte-Carlo Integration @@ -29,16 +38,34 @@ integrand = (x) -> exp.(-x.^2) val = (V/N) * sum(integrand(x)) ``` -| MC Efficiency | MC GFLOPS | -|---|---| -| ![MC Efficiency](images/mc_eff.svg) | ![MC GFLOPS](images/mc_ops.svg) | +```@raw html + + + + + + + + + +
MC EfficiencyMC GFLOPS
MC EfficiencyMC GFLOPS
+``` ## Gray-Scott (2D) Solving a PDE requires halo-exchanges and lots of data movement. In this benchmark we fall an order of magnitude short of the `ImplicitGlobalGrid.jl` library which specifically targets multi-node, multi-GPU halo exchanges. We attribute this to the lack of kernel fusion in cuNumeric.jl -![GS GFLOPS](images/gs_gflops_diffeq.svg) +```@raw html + + + + + + + +
GS GFLOPS
GS GFLOPS
+``` # Benchmarking cuNumeric.jl Programs @@ -145,8 +172,3 @@ To generate a weak scaling plot, you must increment the problem size in proporti As part of a more complete benchmark we ran our code on up to 8 A100 GPUs (single-node) and compared it to the Python library cuPyNumeric as well as a custom implementation using CUDA.jl. From these resutls we can see that cuNumeric.jl is capable of scaling and saturating the GPU memory bandwidth for matrix multiplication. - - -| GEMM Efficiency | GEMM GFLOPS | -|---|---| -| ![GEMM Efficiency](images/gemm_efficiency.svg) | ![GEMM GFLOPS](images/gemm_gflops.svg) | diff --git a/examples/gray-scott.jl b/examples/gray-scott.jl index f5865705..b7eae81c 100644 --- a/examples/gray-scott.jl +++ b/examples/gray-scott.jl @@ -1,5 +1,5 @@ using cuNumeric -# using Plots +using Plots struct Params{T} dx::T @@ -63,7 +63,7 @@ function step!(u, v, u_new, v_new, args::Params) end function gray_scott() - #anim = Animation() + anim = Animation() N = 100 dims = (N, N) @@ -78,8 +78,8 @@ function gray_scott() u_new = cuNumeric.zeros(dims) v_new = cuNumeric.zeros(dims) - u[1:15, 1:15] = cuNumeric.rand(15, 15) - v[1:15, 1:15] = cuNumeric.rand(15, 15) + u[1:15, 1:15] = cuNumeric.rand(Float32, 15, 15) + v[1:15, 1:15] = cuNumeric.rand(Float32, 15, 15) for n in 1:n_steps step!(u, v, u_new, v_new, args) @@ -88,13 +88,12 @@ function gray_scott() u, u_new = u_new, u v, v_new = v_new, v - # if n%frame_interval == 0 - # u_cpu = u[:, :] - # heatmap(u_cpu, clims=(0, 1)) - # frame(anim) - # end + if n%frame_interval == 0 + heatmap(Array(u); clims=(0, 1)) + frame(anim) + end end - # gif(anim, "gray-scott.gif", fps=10) + gif(anim, "gray-scott.gif"; fps=10) return u, v end