diff --git a/.githash b/.githash
index 6dd09371..cee7a114 100644
--- a/.githash
+++ b/.githash
@@ -1 +1 @@
-17adad8a6f9dfd2a79e3a9297020abeedfe5a10f
+ff81537a0c8e23806869eef5c28c235b0dc3fbbe
diff --git a/README.md b/README.md
index 543d857a..88703e7c 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ cuNumeric.versioninfo()
```
> [!WARNING]
-> Starting more than one instance of cuNumeric.jl can lead to a hard-crash. The default hardware configuration reserves all available resources. See the [hardware configuration](https://julialegate.github.io/cuNumeric.jl/dev/perf#Setting-Hardware-Configuration) documentation to learn more.
+> Starting more than one instance of cuNumeric.jl can lead to a hard-crash. The default hardware configuration reserves all available resources. For more details, please visit our [hardware configuration](https://julialegate.github.io/cuNumeric.jl/dev/perf#Setting-Hardware-Configuration) documentation.
### Monte-Carlo Example
```julia
diff --git a/docs/src/benchmark.md b/docs/src/benchmark.md
index f4a0b845..ac082766 100644
--- a/docs/src/benchmark.md
+++ b/docs/src/benchmark.md
@@ -15,9 +15,18 @@ Code Outline:
mul!(C, A, B)
```
-| GEMM Efficiency | GEMM GFLOPS |
-|---|---|
-|  |  |
+```@raw html
+
+
+ | GEMM Efficiency |
+ GEMM GFLOPS |
+
+
+  |
+  |
+
+
+```
## Monte-Carlo Integration
@@ -29,16 +38,34 @@ integrand = (x) -> exp.(-x.^2)
val = (V/N) * sum(integrand(x))
```
-| MC Efficiency | MC GFLOPS |
-|---|---|
-|  |  |
+```@raw html
+
+
+ | MC Efficiency |
+ MC GFLOPS |
+
+
+  |
+  |
+
+
+```
## Gray-Scott (2D)
Solving a PDE requires halo-exchanges and lots of data movement. In this benchmark we fall an order of magnitude short of the `ImplicitGlobalGrid.jl` library which specifically targets multi-node, multi-GPU halo exchanges. We attribute this to the lack of kernel fusion in cuNumeric.jl
-
+```@raw html
+
+
+ | GS GFLOPS |
+
+
+  |
+
+
+```
# Benchmarking cuNumeric.jl Programs
@@ -145,8 +172,3 @@ To generate a weak scaling plot, you must increment the problem size in proporti
As part of a more complete benchmark we ran our code on up to 8 A100 GPUs (single-node) and compared it to the Python library cuPyNumeric as well as a custom implementation using CUDA.jl. From these resutls we can see that cuNumeric.jl is capable of scaling and saturating the GPU memory bandwidth for matrix multiplication.
-
-
-| GEMM Efficiency | GEMM GFLOPS |
-|---|---|
-|  |  |
diff --git a/examples/gray-scott.jl b/examples/gray-scott.jl
index f5865705..b7eae81c 100644
--- a/examples/gray-scott.jl
+++ b/examples/gray-scott.jl
@@ -1,5 +1,5 @@
using cuNumeric
-# using Plots
+using Plots
struct Params{T}
dx::T
@@ -63,7 +63,7 @@ function step!(u, v, u_new, v_new, args::Params)
end
function gray_scott()
- #anim = Animation()
+ anim = Animation()
N = 100
dims = (N, N)
@@ -78,8 +78,8 @@ function gray_scott()
u_new = cuNumeric.zeros(dims)
v_new = cuNumeric.zeros(dims)
- u[1:15, 1:15] = cuNumeric.rand(15, 15)
- v[1:15, 1:15] = cuNumeric.rand(15, 15)
+ u[1:15, 1:15] = cuNumeric.rand(Float32, 15, 15)
+ v[1:15, 1:15] = cuNumeric.rand(Float32, 15, 15)
for n in 1:n_steps
step!(u, v, u_new, v_new, args)
@@ -88,13 +88,12 @@ function gray_scott()
u, u_new = u_new, u
v, v_new = v_new, v
- # if n%frame_interval == 0
- # u_cpu = u[:, :]
- # heatmap(u_cpu, clims=(0, 1))
- # frame(anim)
- # end
+ if n%frame_interval == 0
+ heatmap(Array(u); clims=(0, 1))
+ frame(anim)
+ end
end
- # gif(anim, "gray-scott.gif", fps=10)
+ gif(anim, "gray-scott.gif"; fps=10)
return u, v
end