diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index a441514..d889c9e 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+Statistics = "10745b16-79ce-11e8-05f9-91d7b0d4bd36"
 StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
index 93736bb..dbc7d20 100644
--- a/benchmark/runbenchmarks.jl
+++ b/benchmark/runbenchmarks.jl
@@ -2,6 +2,7 @@
 import AcceleratedKernels as AK
 using KernelAbstractions
 using GPUArrays
+using Statistics
 using BenchmarkTools
 using BenchmarkPlots, StatsPlots, FileIO
 using StableRNGs
diff --git a/benchmark/statistics.jl b/benchmark/statistics.jl
new file mode 100644
index 0000000..2bcc8e2
--- /dev/null
+++ b/benchmark/statistics.jl
@@ -0,0 +1,26 @@
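+# Benchmarks comparing AK.mean / AK.var against Statistics.mean / Statistics.var.
+# Assumes the harness in runbenchmarks.jl provides the `SUITE` benchmark group and
+# the `rng`, `ArrayType` globals and `@sb` macro used below.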
+group = addgroup!(SUITE, "statistics")
+
+n1d = 1_000_000
+n3d = 100
+for T in [UInt32, Int64, Float32]
+    local _group = addgroup!(group, "mean $T")
+
+    # Floats are sampled over the whole type, integers over a small range
+    local randrange = T == Float32 ? T : (T(1):T(100))
+
+    _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+    _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+
+    _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+    _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+
+    local _group = addgroup!(group, "var $T")
+    _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+    _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+
+    _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+    _group["varnd_ak"] = @benchmarkable @sb(AK.var(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+end
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d662c2a..37a30ea 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -34,6 +34,7 @@ include("accumulate/accumulate.jl")
 include("searchsorted.jl")
 include("predicates.jl")
 include("arithmetics.jl")
+include("statistics/statistics.jl")
 
 end # module AcceleratedKernels
diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl
new file mode 100644
index 0000000..cae47c9
--- /dev/null
+++ b/src/statistics/mean.jl
@@ -0,0 +1,115 @@
+"""
+    mean(
+        [f,] src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+Compute the mean of `src` along dimensions `dims`, after applying `f` (if given) to each element.
+If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along
+that dimension. The return type is the element type of `src` for floating-point types, or
+`Float32` for integer types.
+
+## CPU settings
+Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all
+other cases scale linearly with the number of threads.
+
+## GPU settings
+The `block_size` parameter controls the number of threads per block.
+
+The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a
+scalar (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)`
+is required. For reduction along a dimension (`dims` is an integer), `temp` is used as the
+destination array and thus must have the exact dimensions required - i.e. the same dimensionwise
+sizes as `src`, except for the reduced dimension, which becomes 1; there are some corner cases
+when one dimension is zero - check against `Base.reduce` for CPU arrays for the exact behaviour.
+
+The `switch_below` parameter controls the threshold below which the reduction is performed on
+the CPU; it is only used for 1D reductions (i.e. `dims=nothing`).
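+
+## Examples
+A minimal usage sketch (illustrative only - plain `Array`s shown, but any array type with a
+KernelAbstractions backend works the same):
+```julia
+import AcceleratedKernels as AK
+
+v = rand(Int32(1):Int32(100), 1_000)
+AK.mean(v)          # Float32 scalar
+AK.mean(abs2, v)    # mean of squared elements, Float32 scalar
+
+A = rand(Float32, 10, 10, 10)
+AK.mean(A; dims=3)  # 10×10×1 Float32 array
+```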
+"""
+function mean(
+    f::Function, src::AbstractArray{T}, backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractArray}=nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    # Accumulate in Float32 for integer inputs so the mean is fractional
+    init = T <: Integer ? zero(Float32) : zero(T)
+    res = mapreduce(f, +, src, backend;
+                    init=init,
+                    dims=dims,
+                    max_tasks=max_tasks,
+                    min_elems=min_elems,
+                    prefer_threads=prefer_threads,
+                    block_size=block_size,
+                    temp=temp,
+                    switch_below=switch_below)
+    if isnothing(dims)
+        return res / length(src)
+    else
+        return res ./ size(src, dims)
+    end
+end
+
+function mean(
+    src::AbstractArray{T}, backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractArray}=nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T <: Integer ? zero(Float32) : zero(T)
+    res = reduce(+, src, backend;
+                 init=init,
+                 dims=dims,
+                 max_tasks=max_tasks,
+                 min_elems=min_elems,
+                 prefer_threads=prefer_threads,
+                 block_size=block_size,
+                 temp=temp,
+                 switch_below=switch_below)
+    if isnothing(dims)
+        return res / length(src)
+    else
+        return res ./ size(src, dims)
+    end
+end
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
new file mode 100644
index 0000000..6f78f1b
--- /dev/null
+++ b/src/statistics/statistics.jl
@@ -0,0 +1,2 @@
+include("mean.jl")
+include("var.jl")
diff --git a/src/statistics/var.jl b/src/statistics/var.jl
new file mode 100644
index 0000000..f6d3a66
--- /dev/null
+++ b/src/statistics/var.jl
@@ -0,0 +1,152 @@
+# Merge two `(count, mean, M2)` accumulators with Chan et al.'s parallel
+# update; `M2` is the running sum of squared deviations from the mean.
+@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real}
+    nA, mA, M2A = a
+    nB, mB, M2B = b
+    if nA == 0
+        return b
+    elseif nB == 0
+        return a
+    else
+        nAB = nA + nB
+        δ = mB - mA
+        invn = inv(T(nAB))
+        mAB = (nA * mA + nB * mB) * invn
+        cross = (δ * δ) * (nA * nB * invn)
+        return (nAB, mAB, M2A + M2B + cross)
+    end
+end
+
+"""
+    var(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected::Bool=true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+Compute the variance of `src` along dimensions `dims`.
+If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along
+that dimension. If `corrected` is `true` (the default), the sum of squared deviations is scaled
+by `n - 1` rather than `n`. The return type is the element type of `src` for floating-point
+types, or `Float32` for integer types.
+
+## CPU settings
+Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all
+other cases scale linearly with the number of threads.
+
+## GPU settings
+The `block_size` parameter controls the number of threads per block.
+
+The `temp` parameter is accepted for API compatibility but is currently ignored - `var`
+allocates its own intermediate storage of `(count, mean, M2)` tuples.
+
+The `switch_below` parameter controls the threshold below which the reduction is performed on
+the CPU; it is only used for 1D reductions (i.e. `dims=nothing`).
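+
+## Examples
+A minimal usage sketch (illustrative only - plain `Array`s shown, but any array type with a
+KernelAbstractions backend works the same):
+```julia
+import AcceleratedKernels as AK
+
+v = rand(Float32, 1_000)
+AK.var(v)                   # corrected sample variance, Float32 scalar
+AK.var(v; corrected=false)  # population variance
+
+A = rand(Int32(1):Int32(100), 10, 10, 10)
+AK.var(A; dims=2)           # 10×1×10 Float32 array
+```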
+""" +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored + switch_below::Int=0, +) where {T<:Integer,N} + + init = (0, 0f0, 0f0) + mapper = x -> (1, Float32(x), 0f0) + + stats = mapreduce( + mapper, _chan_merge, src, backend; + init=init, neutral=init, + dims=dims, + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / Float32(n - ifelse(corrected , 1 , 0)) + else + out = similar(stats, Float32) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end + + +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored + switch_below::Int=0, +) where {T<:AbstractFloat,N} + + init = (0, zero(T), zero(T)) + mapper = x -> (1, x, zero(typeof(x))) + + stats = mapreduce( + mapper, _chan_merge, src, backend; + init=init, neutral=init, + dims=dims, + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / T(n - ifelse(corrected , 1 , 0)) + else + out = similar(stats, T) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end diff --git a/test/Project.toml b/test/Project.toml index d77e276..8ce8029 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..af19fcc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ import AcceleratedKernels as AK using KernelAbstractions +using Statistics using Test using Random import Pkg @@ -66,6 +67,7 @@ end Aqua.test_all(AK) end +include("statistics.jl") include("partition.jl") include("looping.jl") include("map.jl") diff --git a/test/statistics.jl b/test/statistics.jl new file mode 100644 index 0000000..7bccd91 --- /dev/null +++ b/test/statistics.jl @@ -0,0 +1,228 @@ +@testset "mean" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.mean + for dims in 1:3 + for isize in 1:3 + for jsize in 1:3 + for ksize in 1:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) 
diff --git a/test/Project.toml b/test/Project.toml
index d77e276..8ce8029 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-05f9-91d7b0d4bd36"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index 716fd8e..af19fcc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,6 @@
 import AcceleratedKernels as AK
 using KernelAbstractions
+using Statistics
 using Test
 using Random
 import Pkg
@@ -66,6 +67,7 @@
     Aqua.test_all(AK)
 end
 
+include("statistics.jl")
 include("partition.jl")
 include("looping.jl")
 include("map.jl")
diff --git a/test/statistics.jl b/test/statistics.jl
new file mode 100644
index 0000000..7bccd91
--- /dev/null
+++ b/test/statistics.jl
@@ -0,0 +1,210 @@
+@testset "mean" begin
+    Random.seed!(0)
+
+    # Test corner cases with small sizes against Statistics.mean
+    for dims in 1:3
+        for isize in 1:3
+            for jsize in 1:3
+                for ksize in 1:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.mean(s; prefer_threads, dims)
+                    dh = Array(d)
+                    @test dh ≈ mean(sh; dims)
+                    @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    # Also exercise reductions past ndims (dims=4 on 3D arrays)
+    for _ in 1:100
+        for dims in 1:4
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    # Test that undefined kwargs are not accepted
+    @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads, bad=:kwarg)
+
+    # Testing different settings
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=nothing,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=2,
+        block_size=64,
+        temp=array_from_host(zeros(Float32, 3, 1, 5)),
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=3,
+        block_size=64,
+        temp=array_from_host(zeros(Float32, 3, 4, 1)),
+        switch_below=50,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end
+
+@testset "var" begin
+    Random.seed!(0)
+
+    # Test corner cases with small sizes against Statistics.var; sizes start
+    # at 2 because the corrected variance of a single element is NaN
+    for dims in 1:3
+        for isize in 2:3
+            for jsize in 2:3
+                for ksize in 2:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.var(s; prefer_threads, dims)
+                    dh = Array(d)
+                    @test dh ≈ var(sh; dims)
+                    @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    # Test that undefined kwargs are not accepted
+    @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads, bad=:kwarg)
+
+    # Testing different settings
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=nothing,
+        corrected=true,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=2,
+        corrected=true,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=3,
+        corrected=false,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end