From 3838ef89229713eba2ae3e39fdaf9dd350a0ce22 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 10 Apr 2024 18:48:40 +0200 Subject: [PATCH 1/6] adding necessary changes for KA transition for gpuarrays --- src/gpuarrays.jl | 55 +++++++++++------------------------------------- 1 file changed, 12 insertions(+), 43 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index d8aaae548..a9ac94ebd 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,14 +1,20 @@ ## GPUArrays interfaces -## execution +GPUArrays.device(x::MtlArray) = x.dev -struct mtlArrayBackend <: AbstractGPUBackend end +import KernelAbstractions +import KernelAbstractions: Backend -struct mtlKernelContext <: AbstractKernelContext end - -@inline function GPUArrays.launch_heuristic(::mtlArrayBackend, f::F, args::Vararg{Any,N}; +@inline function GPUArrays.launch_heuristic(::MetalBackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} - kernel = @metal launch=false f(mtlKernelContext(), args...) + + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + nothing) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + + kernel = @metal launch=false f(ctx(), args...) # The pipeline state automatically computes occupancy stats threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup) @@ -17,43 +23,6 @@ struct mtlKernelContext <: AbstractKernelContext end return (; threads=Int(threads), blocks=Int(blocks)) end -function GPUArrays.gpu_call(::mtlArrayBackend, f, args, threads::Int, groups::Int; - name::Union{String,Nothing}) - @metal threads groups name f(mtlKernelContext(), args...) -end - - -## on-device - -# indexing -GPUArrays.blockidx(ctx::mtlKernelContext) = threadgroup_position_in_grid_1d() -GPUArrays.blockdim(ctx::mtlKernelContext) = threads_per_threadgroup_1d() -GPUArrays.threadidx(ctx::mtlKernelContext) = thread_position_in_threadgroup_1d() -GPUArrays.griddim(ctx::mtlKernelContext) = threadgroups_per_grid_1d() -GPUArrays.global_index(ctx::mtlKernelContext) = thread_position_in_grid_1d() -GPUArrays.global_size(ctx::mtlKernelContext) = threads_per_grid_1d() - -# memory - -@inline function GPUArrays.LocalMemory(::mtlKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} - ) where {T, dims, id} - ptr = emit_threadgroup_memory(T, Val(prod(dims))) - MtlDeviceArray(dims, ptr) -end - -# synchronization - -@inline GPUArrays.synchronize_threads(::mtlKernelContext) = - threadgroup_barrier(MemoryFlagThreadGroup) - - - -# -# Host abstractions -# - -GPUArrays.backend(::Type{<:MtlArray}) = mtlArrayBackend() - const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:MtlArray}) dev = device() From b1eb3ee4065b7d1fcbae7b590dec11fccdc237fb Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 11:53:23 +0200 Subject: [PATCH 2/6] mtlBackend -> MetalBackend --- src/Metal.jl | 2 +- src/gpuarrays.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Metal.jl b/src/Metal.jl index 08eba6039..78608cdc5 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -60,13 +60,13 @@ include("mapreduce.jl") include("accumulate.jl") include("indexing.jl") include("random.jl") -include("gpuarrays.jl") # KernelAbstractions include("MetalKernels.jl") import .MetalKernels: MetalBackend export MetalBackend +include("gpuarrays.jl") include("deprecated.jl") include("precompile.jl") diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index a9ac94ebd..a221ac388 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -14,7 +14,7 @@ import KernelAbstractions: Backend # this might not be the final context, since we may tune the workgroupsize ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @metal launch=false f(ctx(), args...) + kernel = @metal launch=false f(ctx, args...) # The pipeline state automatically computes occupancy stats threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup) From cb7d9f8ce5ece6ccec60d553495479cbadd39ef2 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 23:50:55 +0200 Subject: [PATCH 3/6] mimicking CUDA --- src/gpuarrays.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index a221ac388..84725512b 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -5,16 +5,16 @@ GPUArrays.device(x::MtlArray) = x.dev import KernelAbstractions import KernelAbstractions: Backend -@inline function GPUArrays.launch_heuristic(::MetalBackend, f::F, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {F,N} +@inline function GPUArrays.launch_heuristic(::MetalBackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) - # this might not be the final context, since we may tune the workgroupsize ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @metal launch=false f(ctx, args...) + kernel = @metal launch=false obj.f(ctx, args...) # The pipeline state automatically computes occupancy stats threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup) From 3de70b0c8908ef1e6879fcacb1a2b38d1eeedd24 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Tue, 23 Jul 2024 09:31:57 +0200 Subject: [PATCH 4/6] copying CUDA --- src/Metal.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Metal.jl b/src/Metal.jl index 78608cdc5..9cef4709c 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -14,6 +14,7 @@ using Artifacts using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS include("version.jl") +import KernelAbstractions as KA # core library include("../lib/mtl/MTL.jl") @@ -63,7 +64,7 @@ include("random.jl") # KernelAbstractions include("MetalKernels.jl") -import .MetalKernels: MetalBackend +import .MetalKernels: MetalBackend, KA.launch_config export MetalBackend include("gpuarrays.jl") From e8ffa59ac6a5ae00cb4374acc7d73dbfefbc61f0 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 25 Jul 2024 15:23:54 +0200 Subject: [PATCH 5/6] removing heuristic --- src/gpuarrays.jl | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 84725512b..37b8f852e 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -2,27 +2,6 @@ GPUArrays.device(x::MtlArray) = x.dev -import KernelAbstractions -import KernelAbstractions: Backend - -@inline function GPUArrays.launch_heuristic(::MetalBackend, obj::O, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {O,N} - - ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, - nothing) - - ctx = KA.mkcontext(obj, ndrange, iterspace) - - kernel = @metal launch=false obj.f(ctx, args...) - - # The pipeline state automatically computes occupancy stats - threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup) - blocks = cld(elements, threads) - - return (; threads=Int(threads), blocks=Int(blocks)) -end - const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:MtlArray}) dev = device() From cfe7eacc9f784980ef3c72229feeb083a507ac1c Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 16 Sep 2024 14:06:47 +0200 Subject: [PATCH 6/6] Revert "removing heuristic" This reverts commit 9a7a84a9efc08df3932a16f2ad283c9dc04674d3. --- src/gpuarrays.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 37b8f852e..84725512b 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -2,6 +2,27 @@ GPUArrays.device(x::MtlArray) = x.dev +import KernelAbstractions +import KernelAbstractions: Backend + +@inline function GPUArrays.launch_heuristic(::MetalBackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} + + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, + nothing) + + ctx = KA.mkcontext(obj, ndrange, iterspace) + + kernel = @metal launch=false obj.f(ctx, args...) + + # The pipeline state automatically computes occupancy stats + threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup) + blocks = cld(elements, threads) + + return (; threads=Int(threads), blocks=Int(blocks)) +end + const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:MtlArray}) dev = device()