1 change: 1 addition & 0 deletions benchmark/Project.toml
@@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
1 change: 1 addition & 0 deletions benchmark/runbenchmarks.jl
@@ -2,6 +2,7 @@ import AcceleratedKernels as AK
using KernelAbstractions
using GPUArrays

using Statistics
using BenchmarkTools
using BenchmarkPlots, StatsPlots, FileIO
using StableRNGs
25 changes: 25 additions & 0 deletions benchmark/statistics.jl
@@ -0,0 +1,25 @@
group = addgroup!(SUITE, "statistics")

n1d = 1_000_000
n3d = 100
for T in [UInt32, Int64, Float32]
    local _group = addgroup!(group, "mean $T")

    # Floats sample rand's default [0, 1); integers sample 1:100
    local randrange = T == Float32 ? T : (T(1):T(100))

    _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))
    _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))

    _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
    _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))

    local _group = addgroup!(group, "var $T")
    _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))
    _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d)))

    _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
    _group["varnd_ak"] = @benchmarkable @sb(AK.var(v, dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d, n3d, n3d)))
end
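
These groups hook into the BenchmarkTools suite assembled in runbenchmarks.jl. A minimal sketch of running only the new group locally, assuming the setup from that script (`SUITE`, `rng`, `ArrayType`, and the `@sb` macro) is already loaded:

# Sketch (not part of the diff): run only the statistics benchmarks,
# assuming SUITE, rng, ArrayType and @sb come from runbenchmarks.jl.
using BenchmarkTools
results = run(SUITE["statistics"]; verbose=true)
display(median(results))    # median timings per benchmark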
1 change: 1 addition & 0 deletions src/AcceleratedKernels.jl
@@ -34,6 +34,7 @@ include("accumulate/accumulate.jl")
include("searchsorted.jl")
include("predicates.jl")
include("arithmetics.jl")
include("statistics/statistics.jl")


end # module AcceleratedKernels
96 changes: 96 additions & 0 deletions src/statistics/mean.jl
@@ -0,0 +1,96 @@
"""
mean(
f, src::AbstractArray{T}, backend::Backend=get_backend(src);
dims::Union{Nothing, Int}=nothing,

# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
switch_below::Int=0,
) where {T<:Real}

Compute the mean of `src` along dimensions `dims`, applying `f` to each element first.
If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
dimension. The result has the same element type as `src` for floating-point inputs, or `Float32`
for integer inputs.

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all other
cases scale linearly with the number of threads.

## GPU settings
The `block_size` parameter controls the number of threads per block.

The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
zero, check against `Base.reduce` for CPU arrays for exact behavior.

The `switch_below` parameter controls the threshold below which the reduction is performed on the
CPU and is only used for 1D reductions (i.e. `dims=nothing`).
"""
function mean(
    f::Function, src::AbstractArray{T}, backend::Backend=get_backend(src);
    dims::Union{Nothing, Int}=nothing,

    # CPU settings (forwarded to mapreduce)
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,

    # GPU settings
    block_size::Int=256,
    temp::Union{Nothing, AbstractArray}=nothing,
    switch_below::Int=0,
) where {T<:Real}
    # Accumulate in Float32 for integer inputs to avoid overflow and integer division
    init = T <: Integer ? zero(Float32) : zero(T)
    res = mapreduce(f, +, src, backend;
        init=init,
        dims=dims,
        max_tasks=max_tasks,
        min_elems=min_elems,
        prefer_threads=prefer_threads,
        block_size=block_size,
        temp=temp,
        switch_below=switch_below)
    if isnothing(dims)
        return res / length(src)
    else
        return res ./ size(src, dims)
    end
end

function mean(
    src::AbstractArray{T}, backend::Backend=get_backend(src);
    dims::Union{Nothing, Int}=nothing,

    # CPU settings (forwarded to reduce)
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,

    # GPU settings
    block_size::Int=256,
    temp::Union{Nothing, AbstractArray}=nothing,
    switch_below::Int=0,
) where {T<:Real}
    # Accumulate in Float32 for integer inputs to avoid overflow and integer division
    init = T <: Integer ? zero(Float32) : zero(T)
    res = reduce(+, src, backend;
        init=init,
        dims=dims,
        max_tasks=max_tasks,
        min_elems=min_elems,
        prefer_threads=prefer_threads,
        block_size=block_size,
        temp=temp,
        switch_below=switch_below)
    if isnothing(dims)
        return res / length(src)
    else
        return res ./ size(src, dims)
    end
end
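
A brief usage sketch (not part of the diff), checking the two new methods against Statistics.mean on a CPU array; the same calls should apply to device arrays on GPU backends:

# Usage sketch: AK.mean vs Statistics.mean (names as in this PR)
import AcceleratedKernels as AK
using Statistics

x = rand(Float32, 1_000_000)
AK.mean(x) ≈ Statistics.mean(x)                    # scalar reduction
AK.mean(abs2, x) ≈ Statistics.mean(abs2, x)        # with a mapping function

y = rand(Float32, 100, 100, 100)
AK.mean(y; dims=3) ≈ Statistics.mean(y; dims=3)    # reduce along one dimension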
2 changes: 2 additions & 0 deletions src/statistics/statistics.jl
@@ -0,0 +1,2 @@
include("mean.jl")
include("var.jl")
133 changes: 133 additions & 0 deletions src/statistics/var.jl
@@ -0,0 +1,133 @@
# Merge two partial (count, mean, M2) summaries using Chan et al.'s parallel
# variance update, where M2 is the sum of squared deviations from the mean.
@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real}
    nA, mA, M2A = a
    nB, mB, M2B = b
    if nA == 0
        return b
    elseif nB == 0
        return a
    else
        nAB = nA + nB
        δ = mB - mA
        invn = inv(T(nAB))
        mean = (nA*mA + nB*mB) * invn
        cross = (δ*δ) * (nA*nB * invn)
        return (nAB, mean, M2A + M2B + cross)
    end
end
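
For reference, this merge is the combination step of Chan et al.'s parallel variance algorithm: for partial summaries (n_A, mean_A, M2_A) and (n_B, mean_B, M2_B), the combined sum of squared deviations is M2_A + M2_B + δ² · n_A·n_B / (n_A + n_B) with δ = mean_B - mean_A, which is what `cross` computes. A small sanity check (not part of the diff), assuming `_chan_merge` is in scope:

# Sanity-check sketch: merging the summaries of two halves must reproduce
# the full array's M2 (sum of squared deviations), and hence its variance.
using Statistics
summarize(v) = (length(v), mean(v), sum(abs2, v .- mean(v)))   # (n, mean, M2)

a, b = rand(1000), rand(500)
n, m, M2 = _chan_merge(summarize(a), summarize(b))
M2 / (n - 1) ≈ var(vcat(a, b))    # true up to floating-point error
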
"""
var(
src::AbstractArray{T}, backend::Backend=get_backend(src);
dims::Union{Nothing, Int}=nothing,
    corrected::Bool=true,

# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
switch_below::Int=0,
) where {T<:Real}

Compute the variance of `src` along dimensions `dims`.
If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
dimension. The result has the same element type as `src` for floating-point inputs, or `Float32`
for integer inputs. If `corrected` is `true` (the default), the sum of squared deviations is
divided by `n - 1` (Bessel's correction); otherwise it is divided by `n`.

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all other
cases scale linearly with the number of threads.

## GPU settings
The `block_size` parameter controls the number of threads per block.

The `temp` parameter is accepted for API consistency with the other reductions, but is currently
ignored: the underlying `mapreduce` works on `(count, mean, M2)` tuples, so it allocates its own
temporary storage.

The `switch_below` parameter controls the threshold below which the reduction is performed on the
CPU and is only used for 1D reductions (i.e. `dims=nothing`).
"""
function var(
    src::AbstractArray{T,N}, backend::Backend=get_backend(src);
    dims::Union{Nothing,Int}=nothing,
    corrected::Bool=true,
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,
    block_size::Int=256,
    temp::Union{Nothing,AbstractArray}=nothing,     # ignored
    switch_below::Int=0,
) where {T<:Integer,N}

    # Integer inputs accumulate in Float32: (count, running mean, running M2)
    init = (0, 0f0, 0f0)
    mapper = x -> (1, Float32(x), 0f0)

    stats = mapreduce(
        mapper, _chan_merge, src, backend;
        init=init, neutral=init,
        dims=dims,
        max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
        block_size=block_size,
        temp=nothing,
        switch_below=switch_below,
    )

    if dims === nothing
        n, _, M2 = stats
        return M2 / Float32(n - ifelse(corrected, 1, 0))
    else
        out = similar(stats, Float32)
        AcceleratedKernels.map!(
            s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected, 1, 0))),
            out, stats, backend;
            max_tasks=max_tasks, min_elems=min_elems, block_size=block_size,
        )
        return out
    end
end


function var(
    src::AbstractArray{T,N}, backend::Backend=get_backend(src);
    dims::Union{Nothing,Int}=nothing,
    corrected::Bool=true,
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,
    prefer_threads::Bool=true,
    block_size::Int=256,
    temp::Union{Nothing,AbstractArray}=nothing,     # ignored
    switch_below::Int=0,
) where {T<:AbstractFloat,N}

    # Floating-point inputs accumulate in their own precision
    init = (0, zero(T), zero(T))
    mapper = x -> (1, x, zero(typeof(x)))

    stats = mapreduce(
        mapper, _chan_merge, src, backend;
        init=init, neutral=init,
        dims=dims,
        max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
        block_size=block_size,
        temp=nothing,
        switch_below=switch_below,
    )

    if dims === nothing
        n, _, M2 = stats
        return M2 / T(n - ifelse(corrected, 1, 0))
    else
        out = similar(stats, T)
        AcceleratedKernels.map!(
            s -> @inbounds(s[3] / (s[1] - ifelse(corrected, 1, 0))),
            out, stats, backend;
            max_tasks=max_tasks, min_elems=min_elems, block_size=block_size,
        )
        return out
    end
end
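
And a matching usage sketch for the new `var` methods (not part of the diff):

# Usage sketch: AK.var vs Statistics.var (names as in this PR)
import AcceleratedKernels as AK
using Statistics

x = rand(Float32, 1_000_000)
AK.var(x) ≈ Statistics.var(x)                      # corrected (n - 1) by default
AK.var(x; corrected=false) ≈ Statistics.var(x; corrected=false)

z = rand(1:100, 64, 64, 64)                        # integer input, result is Float32
AK.var(z; dims=2) ≈ Statistics.var(z; dims=2)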
1 change: 1 addition & 0 deletions test/Project.toml
@@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -1,5 +1,6 @@
import AcceleratedKernels as AK
using KernelAbstractions
using Statistics
using Test
using Random
import Pkg
@@ -66,6 +67,7 @@ end
Aqua.test_all(AK)
end

include("statistics.jl")
include("partition.jl")
include("looping.jl")
include("map.jl")