1 change: 1 addition & 0 deletions benchmark/Project.toml
@@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
1 change: 1 addition & 0 deletions benchmark/runbenchmarks.jl
@@ -2,6 +2,7 @@ import AcceleratedKernels as AK
using KernelAbstractions
using GPUArrays

using Statistics
using BenchmarkTools
using BenchmarkPlots, StatsPlots, FileIO
using StableRNGs
25 changes: 25 additions & 0 deletions benchmark/statistics.jl
@@ -0,0 +1,25 @@
group = addgroup!(SUITE, "statistics")

n1d = 1_000_000
n3d = 100
for T in [UInt32, Int64, Float32]
    local _group = addgroup!(group, "mean $T")

    # Floats sample rand's default [0, 1); integers sample 1:100
    local randrange = T == Float32 ? T : (T(1):T(100))

    _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))
    _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))

    _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
    _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))

    local _group = addgroup!(group, "var $T")
    _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))
    _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))

    _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
    _group["varnd_ak"] = @benchmarkable @sb(AK.var(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
end
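
These groups hook into the BenchmarkTools suite assembled in runbenchmarks.jl. A minimal sketch of running only the new group locally, assuming the setup from that script (`SUITE`, `rng`, `ArrayType`, and the `@sb` macro) is already loaded:

# Sketch (not part of the diff): run only the statistics benchmarks,
# assuming SUITE, rng, ArrayType and @sb come from runbenchmarks.jl.
using BenchmarkTools
results = run(SUITE["statistics"]; verbose=true)
display(median(results))    # median timings per benchmark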
1 change: 1 addition & 0 deletions src/AcceleratedKernels.jl
@@ -34,6 +34,7 @@ include("accumulate/accumulate.jl")
include("searchsorted.jl")
include("predicates.jl")
include("arithmetics.jl")
include("statistics/statistics.jl")


end # module AcceleratedKernels
96 changes: 96 additions & 0 deletions src/statistics/mean.jl
@@ -0,0 +1,96 @@
"""
mean(
f, src::AbstractArray{T}, backend::Backend=get_backend(src);
dims::Union{Nothing, Int}=nothing,

# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
switch_below::Int=0,
) where {T<:Real}

Compute the mean of `src` along dimensions `dims`, applying `f` to each element first.
If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
dimension. The result has the same element type as `src` for floating-point inputs, or `Float32`
for integer inputs.

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all other
cases scale linearly with the number of threads.

## GPU settings
The `block_size` parameter controls the number of threads per block.

The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
zero, check against `Base.reduce` for CPU arrays for exact behavior.

The `switch_below` parameter controls the threshold below which the reduction is performed on the
CPU and is only used for 1D reductions (i.e. `dims=nothing`).
"""
function mean(
    f::Function, src::AbstractArray{T}, backend::Backend=get_backend(src);
    dims::Union{Nothing, Int}=nothing,

    # CPU settings (forwarded to mapreduce)
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,

    # GPU settings
    block_size::Int=256,
    temp::Union{Nothing, AbstractArray}=nothing,
    switch_below::Int=0,
) where {T<:Real}
    # Accumulate in Float32 for integer inputs to avoid overflow and integer division
    init = T <: Integer ? zero(Float32) : zero(T)
    res = mapreduce(f, +, src, backend;
        init=init,
        dims=dims,
        max_tasks=max_tasks,
        min_elems=min_elems,
        prefer_threads=prefer_threads,
        block_size=block_size,
        temp=temp,
        switch_below=switch_below)
    if isnothing(dims)
        return res / length(src)
    else
        return res ./ size(src, dims)
    end
end

function mean(
    src::AbstractArray{T}, backend::Backend=get_backend(src);
    dims::Union{Nothing, Int}=nothing,

    # CPU settings (forwarded to reduce)
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,

    # GPU settings
    block_size::Int=256,
    temp::Union{Nothing, AbstractArray}=nothing,
    switch_below::Int=0,
) where {T<:Real}
    # Accumulate in Float32 for integer inputs to avoid overflow and integer division
    init = T <: Integer ? zero(Float32) : zero(T)
    res = reduce(+, src, backend;
        init=init,
        dims=dims,
        max_tasks=max_tasks,
        min_elems=min_elems,
        prefer_threads=prefer_threads,
        block_size=block_size,
        temp=temp,
        switch_below=switch_below)
    if isnothing(dims)
        return res / length(src)
    else
        return res ./ size(src, dims)
    end
end
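
A brief usage sketch (not part of the diff), checking the two new methods against Statistics.mean on a CPU array; the same calls should apply to device arrays on GPU backends:

# Usage sketch: AK.mean vs Statistics.mean (names as in this PR)
import AcceleratedKernels as AK
using Statistics

x = rand(Float32, 1_000_000)
AK.mean(x) ≈ Statistics.mean(x)                    # scalar reduction
AK.mean(abs2, x) ≈ Statistics.mean(abs2, x)        # with a mapping function

y = rand(Float32, 100, 100, 100)
AK.mean(y; dims=3) ≈ Statistics.mean(y; dims=3)    # reduce along one dimension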
2 changes: 2 additions & 0 deletions src/statistics/statistics.jl
@@ -0,0 +1,2 @@
include("mean.jl")
include("var.jl")
133 changes: 133 additions & 0 deletions src/statistics/var.jl
@@ -0,0 +1,133 @@
# Merge two partial (count, mean, M2) summaries using Chan et al.'s parallel
# variance update, where M2 is the sum of squared deviations from the mean.
@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real}
    nA, mA, M2A = a
    nB, mB, M2B = b
    if nA == 0
        return b
    elseif nB == 0
        return a
    else
        nAB = nA + nB
        δ = mB - mA
        invn = inv(T(nAB))
        mean = (nA*mA + nB*mB) * invn
        cross = (δ*δ) * (nA*nB * invn)
        return (nAB, mean, M2A + M2B + cross)
    end
end
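
For reference, this merge is the combination step of Chan et al.'s parallel variance algorithm: for partial summaries (n_A, mean_A, M2_A) and (n_B, mean_B, M2_B), the combined sum of squared deviations is M2_A + M2_B + δ² · n_A·n_B / (n_A + n_B) with δ = mean_B - mean_A, which is what `cross` computes. A small sanity check (not part of the diff), assuming `_chan_merge` is in scope:

# Sanity-check sketch: merging the summaries of two halves must reproduce
# the full array's M2 (sum of squared deviations), and hence its variance.
using Statistics
summarize(v) = (length(v), mean(v), sum(abs2, v .- mean(v)))   # (n, mean, M2)

a, b = rand(1000), rand(500)
n, m, M2 = _chan_merge(summarize(a), summarize(b))
M2 / (n - 1) ≈ var(vcat(a, b))    # true up to floating-point error
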
"""
var(
src::AbstractArray{T}, backend::Backend=get_backend(src);
dims::Union{Nothing, Int}=nothing,
    corrected::Bool=true,

# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
switch_below::Int=0,
) where {T<:Real}

Compute the variance of `src` along dimensions `dims`.
If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
dimension. The result has the same element type as `src` for floating-point inputs, or `Float32`
for integer inputs. If `corrected` is `true` (the default), the sum of squared deviations is
divided by `n - 1` (Bessel's correction); otherwise it is divided by `n`.

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all other
cases scale linearly with the number of threads.

## GPU settings
The `block_size` parameter controls the number of threads per block.

The `temp` parameter is accepted for API consistency with the other reductions, but is currently
ignored: the underlying `mapreduce` works on `(count, mean, M2)` tuples, so it allocates its own
temporary storage.

The `switch_below` parameter controls the threshold below which the reduction is performed on the
CPU and is only used for 1D reductions (i.e. `dims=nothing`).
"""
function var(
    src::AbstractArray{T,N}, backend::Backend=get_backend(src);
    dims::Union{Nothing,Int}=nothing,
    corrected::Bool=true,
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,
    block_size::Int=256,
    temp::Union{Nothing,AbstractArray}=nothing,     # ignored
    switch_below::Int=0,
) where {T<:Integer,N}

    # Integer inputs accumulate in Float32: (count, running mean, running M2)
    init = (0, 0f0, 0f0)
    mapper = x -> (1, Float32(x), 0f0)

    stats = mapreduce(
        mapper, _chan_merge, src, backend;
        init=init, neutral=init,
        dims=dims,
        max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
        block_size=block_size,
        temp=nothing,
        switch_below=switch_below,
    )

    if dims === nothing
        n, _, M2 = stats
        return M2 / Float32(n - ifelse(corrected, 1, 0))
    else
        out = similar(stats, Float32)
        AcceleratedKernels.map!(
            s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected, 1, 0))),
            out, stats, backend;
            max_tasks=max_tasks, min_elems=min_elems, block_size=block_size,
        )
        return out
    end
end


function var(
    src::AbstractArray{T,N}, backend::Backend=get_backend(src);
    dims::Union{Nothing,Int}=nothing,
    corrected::Bool=true,
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,
    block_size::Int=256,
    temp::Union{Nothing,AbstractArray}=nothing,     # ignored
    switch_below::Int=0,
) where {T<:AbstractFloat,N}

    # Floating-point inputs accumulate in their own precision
    init = (0, zero(T), zero(T))
    mapper = x -> (1, x, zero(typeof(x)))

    stats = mapreduce(
        mapper, _chan_merge, src, backend;
        init=init, neutral=init,
        dims=dims,
        max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
        block_size=block_size,
        temp=nothing,
        switch_below=switch_below,
    )

    if dims === nothing
        n, _, M2 = stats
        return M2 / T(n - ifelse(corrected, 1, 0))
    else
        out = similar(stats, T)
        AcceleratedKernels.map!(
            s -> @inbounds(s[3] / (s[1] - ifelse(corrected, 1, 0))),
            out, stats, backend;
            max_tasks=max_tasks, min_elems=min_elems, block_size=block_size,
        )
        return out
    end
end
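
And a matching usage sketch for the new `var` methods (not part of the diff):

# Usage sketch: AK.var vs Statistics.var (names as in this PR)
import AcceleratedKernels as AK
using Statistics

x = rand(Float32, 1_000_000)
AK.var(x) ≈ Statistics.var(x)                      # corrected (n - 1) by default
AK.var(x; corrected=false) ≈ Statistics.var(x; corrected=false)

z = rand(1:100, 64, 64, 64)                        # integer input, result is Float32
AK.var(z; dims=2) ≈ Statistics.var(z; dims=2)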
1 change: 1 addition & 0 deletions test/Project.toml
@@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -1,5 +1,6 @@
import AcceleratedKernels as AK
using KernelAbstractions
using Statistics
using Test
using Random
import Pkg
@@ -66,6 +67,7 @@ end
Aqua.test_all(AK)
end

include("statistics.jl")
include("partition.jl")
include("looping.jl")
include("map.jl")