From c8b3fe360100cfa7f4fa9612127976120ceee3e0 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 10:21:22 +0200 Subject: [PATCH 01/23] Add Statistics in KA, (only mean and var implemented) --- src/AcceleratedKernels.jl | 2 +- src/statistics/cor.jl | 0 src/statistics/cov.jl | 0 src/statistics/mean.jl | 88 +++++++++++++++ src/statistics/median.jl | 1 + src/statistics/middle.jl | 1 + src/statistics/quantile.jl | 0 src/statistics/statistics.jl | 10 ++ src/statistics/std.jl | 0 src/statistics/stdm.jl | 0 src/statistics/var.jl | 151 +++++++++++++++++++++++++ src/statistics/varm.jl | 0 test/Project.toml | 1 + test/runtests.jl | 2 + test/statistics.jl | 211 +++++++++++++++++++++++++++++++++++ 15 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 src/statistics/cor.jl create mode 100644 src/statistics/cov.jl create mode 100644 src/statistics/mean.jl create mode 100644 src/statistics/median.jl create mode 100644 src/statistics/middle.jl create mode 100644 src/statistics/quantile.jl create mode 100644 src/statistics/statistics.jl create mode 100644 src/statistics/std.jl create mode 100644 src/statistics/stdm.jl create mode 100644 src/statistics/var.jl create mode 100644 src/statistics/varm.jl create mode 100644 test/statistics.jl diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d662c2a..ce48e1f 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -34,6 +34,6 @@ include("accumulate/accumulate.jl") include("searchsorted.jl") include("predicates.jl") include("arithmetics.jl") - +include("statistics/statistics.jl") end # module AcceleratedKernels diff --git a/src/statistics/cor.jl b/src/statistics/cor.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/statistics/cov.jl b/src/statistics/cov.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl new file mode 100644 index 0000000..97d03e3 --- /dev/null +++ b/src/statistics/mean.jl @@ -0,0 +1,88 @@ +""" + mean( + f, src::AbstractArray{T}, backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + temp::Union{Nothing, AbstractArray}=nothing, + switch_below::Int=0, + ) where {T<:Real} + + Compute the mean of `src` along dimensions `dims` after applying `f`. + If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + if it is an integer type. + ## CPU settings + Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional + arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other + cases are scaling linearly with the number of threads. + + ## GPU settings + The `block_size` parameter controls the number of threads per block. + + The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar + (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is + required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination + array, and thus must have the exact dimensions required - i.e. 
same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
+function mean(
+    f::Function,src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T<:Integer ? zero(Float64) : zero(T)
+    res = mapreduce(f,+,src,backend;
+        init=init,
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    return res ./ size(src,dims)
+end
+
+function mean(
+    src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T<:Integer ? zero(Float64) : zero(T)
+    res = reduce(+,src,backend;
+        init=init,
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    return res./size(src,dims)
+end
\ No newline at end of file
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
new file mode 100644
index 0000000..5dd411f
--- /dev/null
+++ b/src/statistics/median.jl
@@ -0,0 +1 @@
+# need to do the middle kernel first
\ No newline at end of file
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
new file mode 100644
index 0000000..5486841
--- /dev/null
+++ b/src/statistics/middle.jl
@@ -0,0 +1 @@
+# need to sort multi dimensional arrays first
\ No newline at end of file
diff --git a/src/statistics/quantile.jl b/src/statistics/quantile.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
new file mode 100644
index 0000000..46d413d
--- /dev/null
+++ b/src/statistics/statistics.jl
@@ -0,0 +1,10 @@
+include("median.jl")
+include("mean.jl")
+include("cor.jl")
+include("cov.jl")
+include("middle.jl")
+include("quantile.jl")
+include("std.jl")
+include("stdm.jl")
+include("var.jl")
+include("varm.jl")
\ No newline at end of file
diff --git a/src/statistics/std.jl b/src/statistics/std.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/stdm.jl b/src/statistics/stdm.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/var.jl b/src/statistics/var.jl
new file mode 100644
index 0000000..d9c10d0
--- /dev/null
+++ b/src/statistics/var.jl
@@ -0,0 +1,151 @@
+"""
+    var!(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected ::Bool = true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    May modify `src` in place when its element type is a floating-point type (integer inputs are
+    first copied to a new floating-point array, so they are left unchanged).
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+
+    The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
+    (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
+    required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
+    array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
+function var!(
+    src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    corrected ::Bool = true,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    m = mean(
+        src,backend;
+        dims=dims,
+        # CPU settings - ignored here
+        max_tasks = max_tasks,
+        min_elems = min_elems,
+        prefer_threads = prefer_threads,
+        # GPU settings
+        block_size = block_size,
+        temp = temp,
+        switch_below = switch_below
+    )
+    if T<:Integer
+        src = Float64.(src)
+    end
+    # use a special kernel ? what if more than 3 dims ?
+    for sl in eachslice(src,dims=dims)
+        sl .-= selectdim(m,dims,1)
+    end
+    res = mapreduce(x->x*x,+,src,backend;
+        init=zero(eltype(src)),
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    res ./= (size(src,dims) - ifelse(corrected , 1 , 0))
+    return res
+end
+
+"""
+    var(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected ::Bool = true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+ + The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar + (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is + required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination + array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, + except for the reduced dimension which becomes 1; there are some corner cases when one dimension is + zero, check against `Base.reduce` for CPU arrays for exact behavior. + + The `switch_below` parameter controls the threshold below which the reduction is performed on the + CPU and is only used for 1D reductions (i.e. `dims=nothing`). +""" +function var( + src::AbstractArray{T},backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + corrected ::Bool = true, + # CPU settings - ignored here + max_tasks::Int = Threads.nthreads(), + min_elems::Int = 1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int = 256, + temp::Union{Nothing, AbstractArray} = nothing, + switch_below::Int=0, +) where {T<:Real} + return var!(copy(src),backend; + dims=dims, + corrected = corrected, + # CPU settings - ignored here + max_tasks = max_tasks, + min_elems = min_elems, + prefer_threads = prefer_threads, + # GPU settings + block_size = block_size, + temp = temp, + switch_below = switch_below + ) +end \ No newline at end of file diff --git a/src/statistics/varm.jl b/src/statistics/varm.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/Project.toml b/test/Project.toml index d77e276..8ce8029 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..3fa0ba6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ import AcceleratedKernels as AK using KernelAbstractions +using Statistics using Test using Random import Pkg @@ -74,3 +75,4 @@ include("reduce.jl") include("accumulate.jl") include("predicates.jl") include("binarysearch.jl") +include("statistics.jl") \ No newline at end of file diff --git a/test/statistics.jl b/test/statistics.jl new file mode 100644 index 0000000..5bd5e74 --- /dev/null +++ b/test/statistics.jl @@ -0,0 +1,211 @@ +@testset "mean" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.mean + for dims in 1:3 + for isize in 1:3 + for jsize in 1:3 + for ksize in 1:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) + end + end + end + end + + # Fuzzy correctness testing + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(UInt32(1):UInt32(100), n1, 
n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Float32, n1, n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:4 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + init = rand(1:100) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + # Test that undefined kwargs are not accepted + @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) + + # Testing different settings + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=2, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 1, 5)), + switch_below=50, + max_tasks=10, + min_elems=100, + ) + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=3, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 4, 1)), + switch_below=50, + max_tasks=16, + min_elems=1000, + ) +end + +@testset "var" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.var + for dims in 1:3 + for isize in 2:3 + for jsize in 2:3 + for ksize in 2:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.var(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ var(sh; dims) + @test eltype(dh) == eltype(var(sh; dims)) + tv = var(sh; dims) + d = AK.var!(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ tv + @test eltype(dh) == eltype(var(sh; dims)) + end + end + end + end + + # Fuzzy correctness testing + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(UInt32(1):UInt32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Float32, n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + init = rand(1:100) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + # Test that undefined kwargs are not accepted + @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) + + # Testing different settings + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=2, + corrected=true, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 1, 5)), + switch_below=50, + max_tasks=10, + min_elems=100, + ) + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=3, + corrected=false, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 4, 1)), + switch_below=50, + max_tasks=16, + min_elems=1000, + ) +end \ No newline at end of file From 
b0a89546b894bb0b670200433666af797b15ddf9 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 10:45:15 +0200 Subject: [PATCH 02/23] make Float32 the type integers promote to --- src/statistics/mean.jl | 4 ++-- src/statistics/var.jl | 2 +- test/runtests.jl | 2 +- test/statistics.jl | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 97d03e3..2f6f108 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -48,7 +48,7 @@ function mean( temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, ) where {T<:Real} - init = T<:Integer ? zero(Float64) : zero(T) + init = T<:Integer ? zero(Float32) : zero(T) res = mapreduce(f,+,src,backend; init=init, dims=dims, @@ -74,7 +74,7 @@ function mean( temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, ) where {T<:Real} - init = T<:Integer ? zero(Float64) : zero(T) + init = T<:Integer ? zero(Float32) : zero(T) res = reduce(+,src,backend; init=init, dims=dims, diff --git a/src/statistics/var.jl b/src/statistics/var.jl index d9c10d0..7de563b 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -64,7 +64,7 @@ function var!( switch_below = switch_below ) if T<:Integer - src = Float64.(src) + src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? for sl in eachslice(src,dims=dims) diff --git a/test/runtests.jl b/test/runtests.jl index 3fa0ba6..6784f77 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,4 +75,4 @@ include("reduce.jl") include("accumulate.jl") include("predicates.jl") include("binarysearch.jl") -include("statistics.jl") \ No newline at end of file +include("statistics.jl") diff --git a/test/statistics.jl b/test/statistics.jl index 5bd5e74..2f44d04 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -11,11 +11,11 @@ d = AK.mean(s; prefer_threads, dims) dh = Array(d) @test dh ≈ mean(sh; dims) - @test eltype(dh) == eltype(mean(sh; dims)) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 d = AK.mean(s; prefer_threads, dims) dh = Array(d) @test dh ≈ mean(sh; dims) - @test eltype(dh) == eltype(mean(sh; dims)) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 end end end @@ -85,7 +85,7 @@ prefer_threads, dims=2, block_size=64, - temp=array_from_host(zeros(Float64, 3, 1, 5)), + temp=array_from_host(zeros(Float32, 3, 1, 5)), switch_below=50, max_tasks=10, min_elems=100, @@ -96,7 +96,7 @@ prefer_threads, dims=3, block_size=64, - temp=array_from_host(zeros(Float64, 3, 4, 1)), + temp=array_from_host(zeros(Float32, 3, 4, 1)), switch_below=50, max_tasks=16, min_elems=1000, @@ -116,12 +116,12 @@ end d = AK.var(s; prefer_threads, dims) dh = Array(d) @test dh ≈ var(sh; dims) - @test eltype(dh) == eltype(var(sh; dims)) + @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 tv = var(sh; dims) d = AK.var!(s; prefer_threads, dims) dh = Array(d) @test dh ≈ tv - @test eltype(dh) == eltype(var(sh; dims)) + @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 end end end @@ -192,7 +192,7 @@ end dims=2, corrected=true, block_size=64, - temp=array_from_host(zeros(Float64, 3, 1, 5)), + temp=array_from_host(zeros(Float32, 3, 1, 5)), switch_below=50, max_tasks=10, min_elems=100, @@ -203,7 +203,7 @@ end dims=3, corrected=false, block_size=64, - temp=array_from_host(zeros(Float64, 3, 4, 1)), + temp=array_from_host(zeros(Float32, 3, 4, 1)), switch_below=50, max_tasks=16, min_elems=1000, From 
1af77d64d678eea595c20549caf20976fe778056 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:16:21 +0200 Subject: [PATCH 03/23] use a kernel to substract the mean (perf boost and test to fix Metal) --- src/statistics/var.jl | 44 +++++++++++++++++++++++++++++++++++++++---- test/runtests.jl | 4 ++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 7de563b..39f36ff 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,3 +1,29 @@ +@kernel function slicing_mean1d!(src,m) + i = @index(Global, Linear) + src[i] -= m[i] +end +@kernel function slicing_mean2d!(src,m,dims) + @assert 1<=dims<=2 + i,j = @index(Global, NTuple) + if dims == 1 + src[i,j] -= m[i] + else + src[i,j] -= m[j] + end +end +@kernel function slicing_mean3d!(src,m,dims) + @assert 1<=dims<=3 + i,j,k = @index(Global, NTuple) + if dims == 1 + src[i,j,k] -= m[i] + elseif dims == 2 + src[i,j,k] -= m[j] + else + src[i,j,k] -= m[k] + end +end + + """ var!( src::AbstractArray{T}, backend::Backend=get_backend(src); @@ -38,7 +64,7 @@ CPU and is only used for 1D reductions (i.e. `dims=nothing`). """ function var!( - src::AbstractArray{T},backend::Backend=get_backend(src); + src::AbstractArray{T,N},backend::Backend=get_backend(src); dims::Union{Nothing, Int}=nothing, corrected ::Bool = true, # CPU settings - ignored here @@ -50,7 +76,7 @@ function var!( block_size::Int = 256, temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, -) where {T<:Real} +) where {T<:Real,N} m = mean( src,backend; dims=dims, @@ -67,8 +93,18 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if backend isa GPU && N<=3 + if N == 1 + slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + elseif N == 2 + slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) + elseif N == 3 + slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + end + else + for sl in eachslice(src,dims=dims) + sl .-= selectdim(m,dims,1) + end end res = mapreduce(x->x*x,+,src,backend; init=zero(eltype(src)), diff --git a/test/runtests.jl b/test/runtests.jl index 6784f77..5bb7f17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -67,6 +67,7 @@ end Aqua.test_all(AK) end +include("statistics.jl") include("partition.jl") include("looping.jl") include("map.jl") @@ -74,5 +75,4 @@ include("sort.jl") include("reduce.jl") include("accumulate.jl") include("predicates.jl") -include("binarysearch.jl") -include("statistics.jl") +include("binarysearch.jl") \ No newline at end of file From c897aa7de9cbb5b9758a7d85204e1faa97d8ccda Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:21:29 +0200 Subject: [PATCH 04/23] fix --- src/statistics/var.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 39f36ff..bc53459 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,25 +1,25 @@ @kernel function slicing_mean1d!(src,m) i = @index(Global, Linear) - src[i] -= m[i] + src[i] -= m end @kernel function slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 i,j = @index(Global, NTuple) if dims == 1 - src[i,j] -= m[i] + src[i,j] -= m[1,j] else - src[i,j] -= m[j] + src[i,j] -= m[i,1] end end @kernel function slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 i,j,k = @index(Global, NTuple) if dims == 1 - src[i,j,k] -= m[i] + src[i,j,k] -= m[1,j,k] elseif dims == 2 - src[i,j,k] -= m[j] 
+ src[i,j,k] -= m[i,1,k] else - src[i,j,k] -= m[k] + src[i,j,k] -= m[i,j,1] end end From f31b2329a2676869176b6382625a5538ec9eb02d Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:29:34 +0200 Subject: [PATCH 05/23] make gpu friendly backends --- src/statistics/var.jl | 52 +++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index bc53459..c2f0a2b 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,25 +1,43 @@ -@kernel function slicing_mean1d!(src,m) - i = @index(Global, Linear) - src[i] -= m +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean1d!(src,m) + N = @groupsize()[1] + iblock = @index(Group, Linear) + ithread = @index(Local, Linear) + i = ithread + (iblock - 0x1) * N + if i <= length(src) + src[i] -= m + end end -@kernel function slicing_mean2d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 - i,j = @index(Global, NTuple) - if dims == 1 - src[i,j] -= m[1,j] - else - src[i,j] -= m[i,1] + Nx,Ny = @groupsize() + iblock,jblock = @index(Group, NTuple) + ithread,jthread = @index(Local, NTuple) + i = ithread + (iblock - 0x1) * Nx + j = jthread + (jblock - 0x1) * Ny + if i<=size(src,1) && j<=size(src,2) + if dims == 1 + src[i,j] -= m[1,j] + else + src[i,j] -= m[i,1] + end end end -@kernel function slicing_mean3d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 - i,j,k = @index(Global, NTuple) - if dims == 1 - src[i,j,k] -= m[1,j,k] - elseif dims == 2 - src[i,j,k] -= m[i,1,k] - else - src[i,j,k] -= m[i,j,1] + Nx,Ny,Nz = @groupsize() + iblock,jblock,kblock = @index(Group, NTuple) + ithread,jthread,kthread = @index(Local, NTuple) + i = ithread + (iblock - 0x1) * Nx + j = jthread + (jblock - 0x1) * Ny + k = kthread + (kblock - 0x1) * Nz + if i<=size(src,1) && j<=size(src,2) && k<=size(src,3) + if dims == 1 + src[i,j,k] -= m[1,j,k] + elseif dims == 2 + src[i,j,k] -= m[i,1,k] + else + src[i,j,k] -= m[i,j,1] + end end end From e6428f26df02737c88af5b69ec440aad81dd43cd Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:30:53 +0200 Subject: [PATCH 06/23] use _ convention for kernels --- src/statistics/var.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index c2f0a2b..0146eee 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean1d!(src,m) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,m) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -7,7 +7,7 @@ src[i] -= m end end -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean2d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 Nx,Ny = @groupsize() iblock,jblock = @index(Group, NTuple) @@ -22,7 +22,7 @@ end end end end -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean3d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 Nx,Ny,Nz = @groupsize() iblock,jblock,kblock = @index(Group, NTuple) From fd17ece928380bdebdbe41e036878861d9606a1c Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:31:45 +0200 
Subject: [PATCH 07/23] f32 default on docs --- src/statistics/mean.jl | 2 +- src/statistics/var.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 2f6f108..6e3659d 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -16,7 +16,7 @@ Compute the mean of `src` along dimensions `dims` after applying `f`. If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 0146eee..ab629a5 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -61,7 +61,7 @@ end Compute the varience of `src` along dimensions `dims`. Can change `src` when using non-integer types If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional @@ -156,7 +156,7 @@ end Compute the varience of `src` along dimensions `dims`. If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional From 3e57905b0d21a1edf797fcb9160af57af04309bf Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:33:19 +0200 Subject: [PATCH 08/23] fix --- src/statistics/var.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ab629a5..cf43f1e 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -113,11 +113,11 @@ function var!( # use a special kernel ? what if more than 3 dims ? 
if backend isa GPU && N<=3 if N == 1 - slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) elseif N == 2 - slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) elseif N == 3 - slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) end else for sl in eachslice(src,dims=dims) From 072d55be5f9fb64918123313c34e2f1be1532f96 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:50:02 +0200 Subject: [PATCH 09/23] fix reducing multi-dim arrray to scalar --- src/statistics/mean.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 6e3659d..c004e95 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -58,7 +58,11 @@ function mean( block_size=block_size, temp=temp, switch_below=switch_below) - return res ./ size(src,dims) + if isnothing(dims) + return res./length(src) + else + return res./size(src,dims) + end end function mean( @@ -84,5 +88,9 @@ function mean( block_size=block_size, temp=temp, switch_below=switch_below) - return res./size(src,dims) + if isnothing(dims) + return res./length(src) + else + return res./size(src,dims) + end end \ No newline at end of file From f6977d6b26bae2f78845fb4d23a11e57570e43b1 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 12:23:39 +0200 Subject: [PATCH 10/23] add kenel for multi dim var >3 --- src/statistics/var.jl | 72 +++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index cf43f1e..e5007a4 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,m) +@kernel inbounds=true unsafe_indices=true function _slicing_mean1d!(src,m) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -7,41 +7,41 @@ src[i] -= m end end -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean2d!(src,m,dims) - @assert 1<=dims<=2 - Nx,Ny = @groupsize() - iblock,jblock = @index(Group, NTuple) - ithread,jthread = @index(Local, NTuple) - i = ithread + (iblock - 0x1) * Nx - j = jthread + (jblock - 0x1) * Ny - if i<=size(src,1) && j<=size(src,2) - if dims == 1 - src[i,j] -= m[1,j] - else - src[i,j] -= m[i,1] - end - end -end -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean3d!(src,m,dims) - @assert 1<=dims<=3 - Nx,Ny,Nz = @groupsize() - iblock,jblock,kblock = @index(Group, NTuple) - ithread,jthread,kthread = @index(Local, NTuple) - i = ithread + (iblock - 0x1) * Nx - j = jthread + (jblock - 0x1) * Ny - k = kthread + (kblock - 0x1) * Nz - if i<=size(src,1) && j<=size(src,2) && k<=size(src,3) - if dims == 1 - src[i,j,k] -= m[1,j,k] - elseif dims == 2 - src[i,j,k] -= m[i,1,k] - else - src[i,j,k] -= m[i,j,1] + +# N-D version: m is an array with size(m, dims) == 1 +@kernel inbounds=true unsafe_indices=true function _slicing_meannd!(src, m, dims) + @assert 1 <= dims <= ndims(src) + + # Shapes/strides + src_strides = strides(src) + m_strides = strides(m) + nd = length(src_strides) + + # Group/thread indices (zero-based, like your mapreduce kernel) + N = @groupsize()[1] + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 + tid = ithread + 
iblock * N + + if tid < length(src) + # Reconstruct multi-index of src from linear tid using strides, + # and build the matching linear index into m by ignoring the reduced dim. + tmp = tid + midx0 = typeof(tid)(0) # zero-based linear index into m + + KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 + idxi = tmp ÷ src_strides[i] # coordinate along dimension i (zero-based) + if i != dims + midx0 += idxi * m_strides[i] + end + tmp = tmp % src_strides[i] end + + # Subtract: src linear index is tid+1 (1-based), m uses midx0+1 with reduced axis fixed to 1 + src[tid + 0x1] -= m[midx0 + 0x1] end end - """ var!( src::AbstractArray{T}, backend::Backend=get_backend(src); @@ -111,13 +111,11 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? - if backend isa GPU && N<=3 + if backend isa GPU if N == 1 _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) - elseif N == 2 - _slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) - elseif N == 3 - _slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + else + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) end else for sl in eachslice(src,dims=dims) From 59622e7a86af54c8e1971a6f67b43613dd54a7eb Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 12:44:50 +0200 Subject: [PATCH 11/23] fix reducing multi-dim arrray to scalar --- src/statistics/var.jl | 29 +++++++++++++++++------------ test/statistics.jl | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index e5007a4..9058820 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true unsafe_indices=true function _slicing_mean1d!(src,m) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m)) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -9,9 +9,7 @@ end # N-D version: m is an array with size(m, dims) == 1 -@kernel inbounds=true unsafe_indices=true function _slicing_meannd!(src, m, dims) - @assert 1 <= dims <= ndims(src) - +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) # Shapes/strides src_strides = strides(src) m_strides = strides(m) @@ -111,15 +109,19 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? 
- if backend isa GPU - if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) - else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) - end + if isnothing(dims) + src .-= m else - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if backend isa GPU + if N == 1 + _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + else + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) + end + else + for sl in eachslice(src,dims=dims) + sl .-= selectdim(m,dims,1) + end end end res = mapreduce(x->x*x,+,src,backend; @@ -131,6 +133,9 @@ function var!( block_size=block_size, temp=temp, switch_below=switch_below) + if isnothing(dims) + return res ./ (length(src) - ifelse(corrected , 1 , 0)) + end res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) return res end diff --git a/test/statistics.jl b/test/statistics.jl index 2f44d04..f0d9ccd 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -79,6 +79,17 @@ @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) # Testing different settings + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=nothing, + block_size=64, + temp=nothing, + switch_below=50, + max_tasks=10, + min_elems=100, + ) AK.mean( x->x , array_from_host(rand(Int32, 3, 4, 5)); @@ -186,6 +197,17 @@ end @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) # Testing different settings + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=nothing, + corrected=true, + block_size=64, + temp=nothing, + switch_below=50, + max_tasks=10, + min_elems=100, + ) AK.var( array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, From d7cc95d61c499e22820a45f28f7dcb550bd9b4f7 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:09:19 +0200 Subject: [PATCH 12/23] kernel like cpu --- src/statistics/var.jl | 44 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 9058820..e2bba19 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -8,34 +8,26 @@ end end -# N-D version: m is an array with size(m, dims) == 1 + @kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) - # Shapes/strides src_strides = strides(src) m_strides = strides(m) nd = length(src_strides) - - # Group/thread indices (zero-based, like your mapreduce kernel) N = @groupsize()[1] iblock = @index(Group, Linear) - 0x1 ithread = @index(Local, Linear) - 0x1 tid = ithread + iblock * N if tid < length(src) - # Reconstruct multi-index of src from linear tid using strides, - # and build the matching linear index into m by ignoring the reduced dim. tmp = tid - midx0 = typeof(tid)(0) # zero-based linear index into m - + midx0 = typeof(tid)(0) KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 - idxi = tmp ÷ src_strides[i] # coordinate along dimension i (zero-based) + idxi = tmp ÷ src_strides[i] if i != dims midx0 += idxi * m_strides[i] end tmp = tmp % src_strides[i] end - - # Subtract: src linear index is tid+1 (1-based), m uses midx0+1 with reduced axis fixed to 1 src[tid + 0x1] -= m[midx0 + 0x1] end end @@ -108,19 +100,38 @@ function var!( if T<:Integer src = Float32.(src) end - # use a special kernel ? what if more than 3 dims ? 
if isnothing(dims) src .-= m else - if backend isa GPU + if use_gpu_algorithm(backend, prefer_threads) if N == 1 _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) else _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) end else - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if N ==1 + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i + src[i] -= m + end + else + src_strides = strides(src) + m_strides = strides(m) + nd = length(src_strides) + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc + @inbounds begin + tmp = isrc - 1 + midx0 = 0 + KernelAbstractions.Extras.@unroll for i in nd:-1:1 + idxi = tmp ÷ src_strides[i] + if i != dims + midx0 += idxi * m_strides[i] + end + tmp = tmp % src_strides[i] + end + src[isrc] -= m[midx0 + 1] + end + end end end end @@ -134,7 +145,8 @@ function var!( temp=temp, switch_below=switch_below) if isnothing(dims) - return res ./ (length(src) - ifelse(corrected , 1 , 0)) + res /= (length(src) - ifelse(corrected , 1 , 0)) + return res end res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) return res From 3d0148927aedd31fcd919cbe59edbb632a046fe7 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:28:46 +0200 Subject: [PATCH 13/23] tiny perf --- src/statistics/var.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index e2bba19..2aa249f 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -148,7 +148,8 @@ function var!( res /= (length(src) - ifelse(corrected , 1 , 0)) return res end - res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) + s = 1/ (size(src,dims) - ifelse(corrected , 1 , 0)) + res .*= s return res end From b8ca7c6bdf1784ca2f789628283b7b6654c17b05 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:48:53 +0200 Subject: [PATCH 14/23] fix --- src/statistics/var.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 2aa249f..ef279c3 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -148,7 +148,7 @@ function var!( res /= (length(src) - ifelse(corrected , 1 , 0)) return res end - s = 1/ (size(src,dims) - ifelse(corrected , 1 , 0)) + s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0)) res .*= s return res end From 009cfa9def992bacc2be471e298b7a53c72d3e27 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 19:42:48 +0200 Subject: [PATCH 15/23] add benchmarks --- benchmark/Project.toml | 1 + benchmark/benchmarkresults.json | 1 + benchmark/statistics.jl | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 benchmark/benchmarkresults.json create mode 100644 benchmark/statistics.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml index a441514..d889c9e 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" diff --git a/benchmark/benchmarkresults.json b/benchmark/benchmarkresults.json new file mode 100644 index 0000000..355c227 --- /dev/null +++ b/benchmark/benchmarkresults.json @@ -0,0 +1 @@ 
+[{"Julia":"1.11.6","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"accumulate_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":395,"time":3.7371e6,"memory":13072,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.4264e6,"memory":2080,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.50375e6,"memory":2032,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":2.4822e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":466,"time":2.0876e6,"memory":15152,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.9522e6,"memory":2096,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.62405e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":1.4878e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":403,"time":1.4421e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":75,"time":2.8291e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":72,"time":1.87295e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=1":["TrialEstimate",{"allocs":69,"time":462000.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=2":["TrialEstimate",{"alloc
s":403,"time":1.60455e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=1":["TrialEstimate",{"allocs":72,"time":3.6004e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=2":["TrialEstimate",{"allocs":75,"time":6.25675e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":69,"time":442900.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"statistics":["BenchmarkGroup",{"data":{"var Int64":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":94,"time":1.76085e6,"memory":2720,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":762800.0,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":171,"time":1.6185e6,"memory":4464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":2.2675e6,"memory":15440,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var UInt32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":1.9149e6,"memory":2640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":1.726e6,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":172,"time":1.9587e6,"memory":4704,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":1.96035e6,"memory":15424,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean 
Int64":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.5188e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":78,"time":1.1905e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.2023e6,"memory":5376,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.00685e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean UInt32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.71e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":79,"time":1.727e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.9029e6,"memory":5360,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.5734e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Float32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":367950.0,"memory":1616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":77,"time":635100.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":204,"time":651850.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":286650.0,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var 
Float32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":573500.0,"memory":2624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":353,"time":439950.0,"memory":11744,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":169,"time":705800.0,"memory":4416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":494,"time":576900.0,"memory":13552,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"map":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.2987e6,"memory":2448,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.35005e6,"memory":2512,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.6264e6,"memory":2464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.61765e6,"memory":2528,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":80,"time":309600.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":91,"time":325700.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_2x":["TrialEstimate",{"allocs":91,"time":167500.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":80,"time":300500.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"accumulate_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":
1.4999e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.6487e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.813e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":723700.0,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":417400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sincos":["TrialEstimate",{"allocs":375,"time":489500.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sincos":["TrialEstimate",{"allocs":195,"time":358400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":455700.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.5352e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.67895e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.8073e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":1.68725e6,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":268700.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance"
:0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":419200.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sin":["TrialEstimate",{"allocs":204,"time":656300.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sin":["TrialEstimate",{"allocs":77,"time":545000.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"sort":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2205,"time":6.4508e6,"memory":70432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":674,"time":1.2854e6,"memory":18352,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":673,"time":3.6601e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base":["TrialEstimate",{"allocs":2204,"time":6.7894e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":2204,"time":7.76615e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":673,"time":1.59125e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"NTuple{5, 
Int64}":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2204,"time":2.15671e7,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":683,"time":4.801e6,"memory":18576,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":713850.0,"memory":1792,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":2.29805e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":1.34505e6,"memory":656,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":1.54965e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":2.0549e6,"memory":1808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":1.7811e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":2.38805e6,"memory":672,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":2.4792e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_dims=1_sin":["TrialEstimate",{"allocs":66,"time":447800.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2":["TrialEstimate",{"allocs":68,"time":261200.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":72,"time":1.762e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":
0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":27,"time":335300.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2_sin":["TrialEstimate",{"allocs":68,"time":272900.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1_sin":["TrialEstimate",{"allocs":27,"time":374600.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2_sin":["TrialEstimate",{"allocs":72,"time":2.5064e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":66,"time":307050.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}]},"tags":[]}]]] \ No newline at end of file diff --git a/benchmark/statistics.jl b/benchmark/statistics.jl new file mode 100644 index 0000000..2bcc8e2 --- /dev/null +++ b/benchmark/statistics.jl @@ -0,0 +1,25 @@ +group = addgroup!(SUITE, "statistics") + +n1d = 1_000_000 +n3d = 100 +for T in [UInt32, Int64, Float32] + local _group = addgroup!(group, "mean $T") + + local randrange = T == Float32 ? 
T : T(1):T(100) + + _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + + + _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + + local _group = addgroup!(group, "var $T") + _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + + + _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + _group["varnd_ak"] = @benchmarkable @sb(AK.var(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + +end From 35b532216cd18082a36cb20864d221d7ac9cbb48 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 21:42:37 +0200 Subject: [PATCH 16/23] little perf improvment --- benchmark/benchmarkresults.json | 1 - benchmark/runbenchmarks.jl | 1 + src/statistics/var.jl | 58 +++++++++++++++++---------------- 3 files changed, 31 insertions(+), 29 deletions(-) delete mode 100644 benchmark/benchmarkresults.json diff --git a/benchmark/benchmarkresults.json b/benchmark/benchmarkresults.json deleted file mode 100644 index 355c227..0000000 --- a/benchmark/benchmarkresults.json +++ /dev/null @@ -1 +0,0 @@ -[{"Julia":"1.11.6","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"accumulate_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":395,"time":3.7371e6,"memory":13072,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.4264e6,"memory":2080,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.50375e6,"memory":2032,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":2.4822e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":466,"time":2.0876e6,"memory":15152,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.9522e6,"memory":2096,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.62405e6,"memo
ry":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":1.4878e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":403,"time":1.4421e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":75,"time":2.8291e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":72,"time":1.87295e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=1":["TrialEstimate",{"allocs":69,"time":462000.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=2":["TrialEstimate",{"allocs":403,"time":1.60455e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=1":["TrialEstimate",{"allocs":72,"time":3.6004e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=2":["TrialEstimate",{"allocs":75,"time":6.25675e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":69,"time":442900.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"statistics":["BenchmarkGroup",{"data":{"var 
Int64":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":94,"time":1.76085e6,"memory":2720,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":762800.0,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":171,"time":1.6185e6,"memory":4464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":2.2675e6,"memory":15440,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var UInt32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":1.9149e6,"memory":2640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":1.726e6,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":172,"time":1.9587e6,"memory":4704,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":1.96035e6,"memory":15424,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Int64":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.5188e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":78,"time":1.1905e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.2023e6,"memory":5376,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.00685e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean 
UInt32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.71e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":79,"time":1.727e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.9029e6,"memory":5360,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.5734e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Float32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":367950.0,"memory":1616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":77,"time":635100.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":204,"time":651850.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":286650.0,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var 
Float32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":573500.0,"memory":2624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":353,"time":439950.0,"memory":11744,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":169,"time":705800.0,"memory":4416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":494,"time":576900.0,"memory":13552,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"map":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.2987e6,"memory":2448,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.35005e6,"memory":2512,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.6264e6,"memory":2464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.61765e6,"memory":2528,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":80,"time":309600.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":91,"time":325700.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_2x":["TrialEstimate",{"allocs":91,"time":167500.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":80,"time":300500.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"accumulate_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":
1.4999e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.6487e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.813e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":723700.0,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":417400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sincos":["TrialEstimate",{"allocs":375,"time":489500.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sincos":["TrialEstimate",{"allocs":195,"time":358400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":455700.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.5352e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.67895e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.8073e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":1.68725e6,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":268700.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance"
:0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":419200.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sin":["TrialEstimate",{"allocs":204,"time":656300.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sin":["TrialEstimate",{"allocs":77,"time":545000.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"sort":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2205,"time":6.4508e6,"memory":70432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":674,"time":1.2854e6,"memory":18352,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":673,"time":3.6601e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base":["TrialEstimate",{"allocs":2204,"time":6.7894e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":2204,"time":7.76615e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":673,"time":1.59125e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"NTuple{5, 
Int64}":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2204,"time":2.15671e7,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":683,"time":4.801e6,"memory":18576,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":713850.0,"memory":1792,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":2.29805e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":1.34505e6,"memory":656,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":1.54965e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":2.0549e6,"memory":1808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":1.7811e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":2.38805e6,"memory":672,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":2.4792e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_dims=1_sin":["TrialEstimate",{"allocs":66,"time":447800.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2":["TrialEstimate",{"allocs":68,"time":261200.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":72,"time":1.762e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":
0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":27,"time":335300.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2_sin":["TrialEstimate",{"allocs":68,"time":272900.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1_sin":["TrialEstimate",{"allocs":27,"time":374600.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2_sin":["TrialEstimate",{"allocs":72,"time":2.5064e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":66,"time":307050.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}]},"tags":[]}]]] \ No newline at end of file diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl index 93736bb..dbc7d20 100644 --- a/benchmark/runbenchmarks.jl +++ b/benchmark/runbenchmarks.jl @@ -2,6 +2,7 @@ import AcceleratedKernels as AK using KernelAbstractions using GPUArrays +using Statistics using BenchmarkTools using BenchmarkPlots, StatsPlots, FileIO using StableRNGs diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ef279c3..ab6cd99 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -31,6 +31,23 @@ end src[tid + 0x1] -= m[midx0 + 0x1] end end +function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N} + src_strides ::NTuple{N,Int} = strides(src) + m_strides ::NTuple{N,Int} = strides(m) + nd = length(src_strides) + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc + tmp = isrc - 1 + midx0 = 0 + KernelAbstractions.Extras.@unroll for i in nd:-1:1 + @inbounds idxi = tmp ÷ src_strides[i] + if i != dims + @inbounds midx0 += idxi * m_strides[i] + end + @inbounds tmp = tmp % src_strides[i] + end + @inbounds src[isrc] -= m[midx0 + 1] + end +end """ var!( @@ -105,9 +122,9 @@ function var!( else if use_gpu_algorithm(backend, prefer_threads) if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + _slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src)) else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src)) end else if N ==1 @@ -115,36 +132,21 @@ function var!( src[i] -= m end else - src_strides = strides(src) - m_strides = strides(m) - nd = length(src_strides) - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc - @inbounds begin - tmp = isrc - 1 - midx0 = 0 - KernelAbstractions.Extras.@unroll for i in nd:-1:1 - idxi = tmp ÷ src_strides[i] - if i != dims - midx0 += idxi * m_strides[i] - end - tmp = tmp % src_strides[i] - end - src[isrc] -= m[midx0 + 1] - end - end + 
_slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems) end end end + ntmp = isnothing(dims) ? nothing : m res = mapreduce(x->x*x,+,src,backend; - init=zero(eltype(src)), - dims=dims, - max_tasks=max_tasks, - min_elems=min_elems, - prefer_threads=prefer_threads, - block_size=block_size, - temp=temp, - switch_below=switch_below) - if isnothing(dims) + init=zero(eltype(src)), + dims=dims, + max_tasks=max_tasks, + min_elems=min_elems, + prefer_threads=prefer_threads, + block_size=block_size, + temp=ntmp, + switch_below=switch_below) + if isnothing(dims) || N == 1 res /= (length(src) - ifelse(corrected , 1 , 0)) return res end From 038a86e5e5ab8073374efc7b033fa82c2bfdc321 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 23:33:56 +0200 Subject: [PATCH 17/23] rework var completly --- src/statistics/var.jl | 288 ++++++++++++------------------------------ test/statistics.jl | 9 +- 2 files changed, 83 insertions(+), 214 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ab6cd99..1b1fbd5 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,223 +1,97 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m)) - N = @groupsize()[1] - iblock = @index(Group, Linear) - ithread = @index(Local, Linear) - i = ithread + (iblock - 0x1) * N - if i <= length(src) - src[i] -= m - end -end - - -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) - src_strides = strides(src) - m_strides = strides(m) - nd = length(src_strides) - N = @groupsize()[1] - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 - tid = ithread + iblock * N - - if tid < length(src) - tmp = tid - midx0 = typeof(tid)(0) - KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 - idxi = tmp ÷ src_strides[i] - if i != dims - midx0 += idxi * m_strides[i] - end - tmp = tmp % src_strides[i] - end - src[tid + 0x1] -= m[midx0 + 0x1] - end -end -function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N} - src_strides ::NTuple{N,Int} = strides(src) - m_strides ::NTuple{N,Int} = strides(m) - nd = length(src_strides) - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc - tmp = isrc - 1 - midx0 = 0 - KernelAbstractions.Extras.@unroll for i in nd:-1:1 - @inbounds idxi = tmp ÷ src_strides[i] - if i != dims - @inbounds midx0 += idxi * m_strides[i] - end - @inbounds tmp = tmp % src_strides[i] - end - @inbounds src[isrc] -= m[midx0 + 1] +@inline function chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} + nA, mA, M2A = a + nB, mB, M2B = b + if nA == 0 + return b + elseif nB == 0 + return a + else + nAB = nA + nB + δ = mB - mA + invn = inv(T(nAB)) + mean = (nA*mA + nB*mB) * invn + cross = (δ*δ) * (nA*nB * invn) + return (nAB, mean, M2A + M2B + cross) end end -""" - var!( - src::AbstractArray{T}, backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads::Bool=true, - - # GPU settings - block_size::Int=256, - temp::Union{Nothing, AbstractArray}=nothing, - switch_below::Int=0, - ) where {T<:Real} - - Compute the varience of `src` along dimensions `dims`. Can change `src` when using non-integer types - If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. 
The return type will be the same as the element type of `src` if it is a float type, or `Float32` - if it is an integer type. - ## CPU settings - Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional - arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other - cases are scaling linearly with the number of threads. - - ## GPU settings - The `block_size` parameter controls the number of threads per block. - - The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar - (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is - required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination - array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, - except for the reduced dimension which becomes 1; there are some corner cases when one dimension is - zero, check against `Base.reduce` for CPU arrays for exact behavior. - - The `switch_below` parameter controls the threshold below which the reduction is performed on the - CPU and is only used for 1D reductions (i.e. `dims=nothing`). -""" -function var!( - src::AbstractArray{T,N},backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - # CPU settings - ignored here - max_tasks::Int = Threads.nthreads(), - min_elems::Int = 1, +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, prefer_threads::Bool=true, - - # GPU settings - block_size::Int = 256, - temp::Union{Nothing, AbstractArray} = nothing, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored switch_below::Int=0, -) where {T<:Real,N} - m = mean( - src,backend; +) where {T<:Integer,N} + + init = (0, 0f0, 0f0) + mapper = x -> (1, Float32(x), 0f0) + + stats = mapreduce( + mapper, chan_merge, src, backend; + init=init, neutral=init, dims=dims, - # CPU settings - ignored here - max_tasks = max_tasks, - min_elems = min_elems, - prefer_threads = prefer_threads, - # GPU settings - block_size = block_size, - temp = temp, - switch_below = switch_below + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, ) - if T<:Integer - src = Float32.(src) - end - if isnothing(dims) - src .-= m + + if dims === nothing + n, _, M2 = stats + return M2 / Float32(n - ifelse(corrected , 1 , 0)) else - if use_gpu_algorithm(backend, prefer_threads) - if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src)) - else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src)) - end - else - if N ==1 - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i - src[i] -= m - end - else - _slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems) - end - end - end - ntmp = isnothing(dims) ? 
nothing : m - res = mapreduce(x->x*x,+,src,backend; - init=zero(eltype(src)), - dims=dims, - max_tasks=max_tasks, - min_elems=min_elems, - prefer_threads=prefer_threads, - block_size=block_size, - temp=ntmp, - switch_below=switch_below) - if isnothing(dims) || N == 1 - res /= (length(src) - ifelse(corrected , 1 , 0)) - return res + # stats is an array of (Int32, Float32, Float32); finalize into Float32 array + out = similar(stats, Float32) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out end - s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0)) - res .*= s - return res end -""" - var!( - src::AbstractArray{T}, backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads=prefer_threads, - - # GPU settings - block_size::Int=256, - temp::Union{Nothing, AbstractArray}=nothing, - switch_below::Int=0, - ) where {T<:Real} - - Compute the varience of `src` along dimensions `dims`. - If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` - if it is an integer type. - ## CPU settings - Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional - arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other - cases are scaling linearly with the number of threads. - - ## GPU settings - The `block_size` parameter controls the number of threads per block. - - The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar - (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is - required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination - array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, - except for the reduced dimension which becomes 1; there are some corner cases when one dimension is - zero, check against `Base.reduce` for CPU arrays for exact behavior. - The `switch_below` parameter controls the threshold below which the reduction is performed on the - CPU and is only used for 1D reductions (i.e. `dims=nothing`). 
-""" function var( - src::AbstractArray{T},backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - # CPU settings - ignored here - max_tasks::Int = Threads.nthreads(), - min_elems::Int = 1, + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, prefer_threads::Bool=true, - - # GPU settings - block_size::Int = 256, - temp::Union{Nothing, AbstractArray} = nothing, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored switch_below::Int=0, -) where {T<:Real} - return var!(copy(src),backend; +) where {T<:AbstractFloat,N} + + init = (0, zero(T), zero(T)) + mapper = x -> (1, x, zero(typeof(x))) + + stats = mapreduce( + mapper, chan_merge, src, backend; + init=init, neutral=init, dims=dims, - corrected = corrected, - # CPU settings - ignored here - max_tasks = max_tasks, - min_elems = min_elems, - prefer_threads = prefer_threads, - # GPU settings - block_size = block_size, - temp = temp, - switch_below = switch_below - ) -end \ No newline at end of file + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / T(n - ifelse(corrected , 1 , 0)) + else + # stats is an array of (Int32, Float32, Float32); finalize into Float32 array + out = similar(stats, T) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end diff --git a/test/statistics.jl b/test/statistics.jl index f0d9ccd..5ab3741 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -128,11 +128,6 @@ end dh = Array(d) @test dh ≈ var(sh; dims) @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 - tv = var(sh; dims) - d = AK.var!(s; prefer_threads, dims) - dh = Array(d) - @test dh ≈ tv - @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 end end end @@ -214,7 +209,7 @@ end dims=2, corrected=true, block_size=64, - temp=array_from_host(zeros(Float32, 3, 1, 5)), + temp=nothing, switch_below=50, max_tasks=10, min_elems=100, @@ -225,7 +220,7 @@ end dims=3, corrected=false, block_size=64, - temp=array_from_host(zeros(Float32, 3, 4, 1)), + temp = nothing, switch_below=50, max_tasks=16, min_elems=1000, From bb68e5199e19fa5bcda4c5a4fc133150ace056a2 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 23:51:36 +0200 Subject: [PATCH 18/23] add doc back --- src/statistics/var.jl | 46 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 1b1fbd5..f6d3a66 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@inline function chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} +@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} nA, mA, M2A = a nB, mB, M2B = b if nA == 0 @@ -14,7 +14,45 @@ return (nAB, mean, M2A + M2B + cross) end end +""" + var( + src::AbstractArray{T}, backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + corrected ::Bool = true, + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + temp::Union{Nothing, 
AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+
+    The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
+    (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
+    required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
+    array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
 function var(
     src::AbstractArray{T,N}, backend::Backend=get_backend(src);
     dims::Union{Nothing,Int}=nothing,
@@ -31,7 +69,7 @@ function var(
     mapper = x -> (1, Float32(x), 0f0)
 
     stats = mapreduce(
-        mapper, chan_merge, src, backend;
+        mapper, _chan_merge, src, backend;
         init=init, neutral=init,
         dims=dims,
         max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
@@ -44,7 +82,6 @@ function var(
         n, _, M2 = stats
         return M2 / Float32(n - ifelse(corrected , 1 , 0))
     else
-        # stats is an array of (Int32, Float32, Float32); finalize into Float32 array
         out = similar(stats, Float32)
         AcceleratedKernels.map!(
             s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))),
@@ -72,7 +109,7 @@ function var(
     mapper = x -> (1, x, zero(typeof(x)))
 
     stats = mapreduce(
-        mapper, chan_merge, src, backend;
+        mapper, _chan_merge, src, backend;
         init=init, neutral=init,
         dims=dims,
         max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
@@ -85,7 +122,6 @@ function var(
         n, _, M2 = stats
         return M2 / T(n - ifelse(corrected , 1 , 0))
     else
-        # stats is an array of (Int32, Float32, Float32); finalize into Float32 array
         out = similar(stats, T)
         AcceleratedKernels.map!(
             s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))),

From face574d73c8c444c82dff690f9c348bac0069fc Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Tue, 19 Aug 2025 23:56:15 +0200
Subject: [PATCH 19/23] avoid rm new line

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5bb7f17..af19fcc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -75,4 +75,4 @@ include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")
 include("predicates.jl")
-include("binarysearch.jl")
\ No newline at end of file
+include("binarysearch.jl")

From f93a70873536dd7c3c5380c5fd967683b7fd9f45 Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 00:03:29 +0200
Subject: [PATCH 20/23] another new line

---
 src/AcceleratedKernels.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index ce48e1f..37a30ea 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -36,4 +36,5 @@ include("predicates.jl")
 include("arithmetics.jl")
 include("statistics/statistics.jl")
+
 
 end # module AcceleratedKernels

From 2e0967d09363431a3bd79e9b2a39aa2325a631cd Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:14:41 +0200
Subject: [PATCH 21/23] other lines

---
 src/statistics/mean.jl   | 2 +-
 src/statistics/median.jl | 2 +-
 src/statistics/middle.jl | 2 +-
 test/statistics.jl       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl
index c004e95..cae47c9 100644
--- a/src/statistics/mean.jl
+++ b/src/statistics/mean.jl
@@ -93,4 +93,4 @@ function mean(
     else
         return res./size(src,dims)
     end
-end
\ No newline at end of file
+end
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
index 5dd411f..b3a65aa 100644
--- a/src/statistics/median.jl
+++ b/src/statistics/median.jl
@@ -1 +1 @@
-# need to do the middle kernel first
\ No newline at end of file
+# need to do the middle kernel first
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
index 5486841..38bee94 100644
--- a/src/statistics/middle.jl
+++ b/src/statistics/middle.jl
@@ -1 +1 @@
-# need to sort multi dimensional arrays first
\ No newline at end of file
+# need to sort multi dimensional arrays first
diff --git a/test/statistics.jl b/test/statistics.jl
index 5ab3741..7bccd91 100644
--- a/test/statistics.jl
+++ b/test/statistics.jl
@@ -225,4 +225,4 @@ end
         max_tasks=16,
         min_elems=1000,
     )
-end
\ No newline at end of file
+end

From bc7869f62a0de4736db31d7ea2792d4cc3c0b69d Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:15:55 +0200
Subject: [PATCH 22/23] other lines

---
 src/statistics/statistics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
index 46d413d..48c4e31 100644
--- a/src/statistics/statistics.jl
+++ b/src/statistics/statistics.jl
@@ -7,4 +7,4 @@ include("quantile.jl")
 include("std.jl")
 include("stdm.jl")
 include("var.jl")
-include("varm.jl")
\ No newline at end of file
+include("varm.jl")

From 1b77f18ca9452847552035355c4f2a1380273af8 Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:17:13 +0200
Subject: [PATCH 23/23] only keep mean and var for now

---
 src/statistics/cor.jl        | 0
 src/statistics/cov.jl        | 0
 src/statistics/median.jl     | 1 -
 src/statistics/middle.jl     | 1 -
 src/statistics/quantile.jl   | 0
 src/statistics/statistics.jl | 8 --------
 src/statistics/std.jl        | 0
 src/statistics/stdm.jl       | 0
 src/statistics/varm.jl       | 0
 9 files changed, 10 deletions(-)
 delete mode 100644 src/statistics/cor.jl
 delete mode 100644 src/statistics/cov.jl
 delete mode 100644 src/statistics/median.jl
 delete mode 100644 src/statistics/middle.jl
 delete mode 100644 src/statistics/quantile.jl
 delete mode 100644 src/statistics/std.jl
 delete mode 100644 src/statistics/stdm.jl
 delete mode 100644 src/statistics/varm.jl

diff --git a/src/statistics/cor.jl b/src/statistics/cor.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/cov.jl b/src/statistics/cov.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
deleted file mode 100644
index b3a65aa..0000000
--- a/src/statistics/median.jl
+++ /dev/null
@@ -1 +0,0 @@
-# need to do the middle kernel first
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
deleted file mode 100644
index 38bee94..0000000
--- a/src/statistics/middle.jl
+++ /dev/null
@@ -1 +0,0 @@
-# need to sort multi dimensional arrays first
diff --git a/src/statistics/quantile.jl b/src/statistics/quantile.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
index 48c4e31..6f78f1b 100644
--- a/src/statistics/statistics.jl
+++ b/src/statistics/statistics.jl
@@ -1,10 +1,2 @@
-include("median.jl")
 include("mean.jl")
-include("cor.jl")
-include("cov.jl")
-include("middle.jl")
-include("quantile.jl")
-include("std.jl")
-include("stdm.jl")
 include("var.jl")
-include("varm.jl")
diff --git a/src/statistics/std.jl b/src/statistics/std.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/stdm.jl b/src/statistics/stdm.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/varm.jl b/src/statistics/varm.jl
deleted file mode 100644
index e69de29..0000000
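
On the algorithm itself: the (n, mean, M2) triples combined by `_chan_merge` (introduced in PATCH 17/23, renamed in PATCH 18/23) follow the well-known pairwise update of Chan, Golub and LeVeque, which is what lets `var` be written as a single `mapreduce`. Below is a standalone sketch of that combination rule in plain Julia; the helper names (`merge_stats`, `chunk_stats`) and the random test data are illustrative, not part of the patches, but the arithmetic mirrors `_chan_merge` in src/statistics/var.jl.

    using Statistics

    # Combine two partial summaries (count, mean, sum of squared deviations),
    # using the same formula as `_chan_merge` in the patch.
    function merge_stats(a, b)
        nA, mA, M2A = a
        nB, mB, M2B = b
        nA == 0 && return b
        nB == 0 && return a
        n = nA + nB
        δ = mB - mA
        m = (nA * mA + nB * mB) / n
        return (n, m, M2A + M2B + δ^2 * (nA * nB / n))
    end

    # Summary of a single chunk, computed directly.
    chunk_stats(x) = (length(x), mean(x), sum(abs2, x .- mean(x)))

    x, y = randn(1_000), randn(500)
    n, m, M2 = merge_stats(chunk_stats(x), chunk_stats(y))

    # Merging per-chunk summaries reproduces the statistics of the whole data:
    @assert m ≈ mean(vcat(x, y))
    @assert M2 / (n - 1) ≈ var(vcat(x, y))   # corrected sample variance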
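
For readers trying the series end-to-end, the tests and benchmarks above call the new entry points as `AK.mean(v)`, `AK.mean(v, dims=3)`, `AK.var(v)` and `AK.var(v, dims=3)`. The snippet below is only a usage sketch under those assumptions (the arrays and the comparisons against Statistics are not part of the patches); it uses the documented `dims` and `corrected` keywords and leaves the CPU/GPU tuning keywords at their defaults.

    import AcceleratedKernels as AK
    using Statistics  # reference implementation for comparison

    v = rand(Float32, 1_000_000)       # 1D, as in the benchmarks
    a = rand(Float32, 100, 100, 100)   # 3D, reduced along dims=3 below

    @assert AK.mean(v) ≈ Statistics.mean(v)
    @assert AK.var(v)  ≈ Statistics.var(v)

    # Along a dimension the reduced axis keeps size 1 (100x100x1 here):
    @assert AK.mean(a, dims=3) ≈ Statistics.mean(a, dims=3)
    @assert AK.var(a, dims=3, corrected=false) ≈ Statistics.var(a, dims=3, corrected=false)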