From c8b3fe360100cfa7f4fa9612127976120ceee3e0 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 10:21:22 +0200 Subject: [PATCH 01/23] Add Statistics in KA, (only mean and var implemented) --- src/AcceleratedKernels.jl | 2 +- src/statistics/cor.jl | 0 src/statistics/cov.jl | 0 src/statistics/mean.jl | 88 +++++++++++++++ src/statistics/median.jl | 1 + src/statistics/middle.jl | 1 + src/statistics/quantile.jl | 0 src/statistics/statistics.jl | 10 ++ src/statistics/std.jl | 0 src/statistics/stdm.jl | 0 src/statistics/var.jl | 151 +++++++++++++++++++++++++ src/statistics/varm.jl | 0 test/Project.toml | 1 + test/runtests.jl | 2 + test/statistics.jl | 211 +++++++++++++++++++++++++++++++++++ 15 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 src/statistics/cor.jl create mode 100644 src/statistics/cov.jl create mode 100644 src/statistics/mean.jl create mode 100644 src/statistics/median.jl create mode 100644 src/statistics/middle.jl create mode 100644 src/statistics/quantile.jl create mode 100644 src/statistics/statistics.jl create mode 100644 src/statistics/std.jl create mode 100644 src/statistics/stdm.jl create mode 100644 src/statistics/var.jl create mode 100644 src/statistics/varm.jl create mode 100644 test/statistics.jl diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d662c2a..ce48e1f 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -34,6 +34,6 @@ include("accumulate/accumulate.jl") include("searchsorted.jl") include("predicates.jl") include("arithmetics.jl") - +include("statistics/statistics.jl") end # module AcceleratedKernels diff --git a/src/statistics/cor.jl b/src/statistics/cor.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/statistics/cov.jl b/src/statistics/cov.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl new file mode 100644 index 0000000..97d03e3 --- /dev/null +++ b/src/statistics/mean.jl @@ -0,0 +1,88 @@ +""" + mean( + f, src::AbstractArray{T}, backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + temp::Union{Nothing, AbstractArray}=nothing, + switch_below::Int=0, + ) where {T<:Real} + + Compute the mean of `src` along dimensions `dims` after applying `f`. + If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + if it is an integer type. + ## CPU settings + Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional + arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other + cases are scaling linearly with the number of threads. + + ## GPU settings + The `block_size` parameter controls the number of threads per block. + + The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar + (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is + required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination + array, and thus must have the exact dimensions required - i.e. 
same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
+function mean(
+    f::Function,src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T<:Integer ? zero(Float64) : zero(T)
+    res = mapreduce(f,+,src,backend;
+        init=init,
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    return res ./ size(src,dims)
+end
+
+function mean(
+    src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T<:Integer ? zero(Float64) : zero(T)
+    res = reduce(+,src,backend;
+        init=init,
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    return res./size(src,dims)
+end
\ No newline at end of file
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
new file mode 100644
index 0000000..5dd411f
--- /dev/null
+++ b/src/statistics/median.jl
@@ -0,0 +1 @@
+# need to do the middle kernel first
\ No newline at end of file
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
new file mode 100644
index 0000000..5486841
--- /dev/null
+++ b/src/statistics/middle.jl
@@ -0,0 +1 @@
+# need to sort multi dimensional arrays first
\ No newline at end of file
diff --git a/src/statistics/quantile.jl b/src/statistics/quantile.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
new file mode 100644
index 0000000..46d413d
--- /dev/null
+++ b/src/statistics/statistics.jl
@@ -0,0 +1,10 @@
+include("median.jl")
+include("mean.jl")
+include("cor.jl")
+include("cov.jl")
+include("middle.jl")
+include("quantile.jl")
+include("std.jl")
+include("stdm.jl")
+include("var.jl")
+include("varm.jl")
\ No newline at end of file
diff --git a/src/statistics/std.jl b/src/statistics/std.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/stdm.jl b/src/statistics/stdm.jl
new file mode 100644
index 0000000..e69de29
diff --git a/src/statistics/var.jl b/src/statistics/var.jl
new file mode 100644
index 0000000..d9c10d0
--- /dev/null
+++ b/src/statistics/var.jl
@@ -0,0 +1,151 @@
+"""
+    var!(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected ::Bool = true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    May modify `src` in place when its element type is a floating-point type (integer inputs are
+    first copied to a new floating-point array, so they are left unchanged).
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+
+    The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
+    (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
+    required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
+    array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
+function var!(
+    src::AbstractArray{T},backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+    corrected ::Bool = true,
+    # CPU settings - ignored here
+    max_tasks::Int = Threads.nthreads(),
+    min_elems::Int = 1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int = 256,
+    temp::Union{Nothing, AbstractArray} = nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    m = mean(
+        src,backend;
+        dims=dims,
+        # CPU settings - ignored here
+        max_tasks = max_tasks,
+        min_elems = min_elems,
+        prefer_threads = prefer_threads,
+        # GPU settings
+        block_size = block_size,
+        temp = temp,
+        switch_below = switch_below
+    )
+    if T<:Integer
+        src = Float64.(src)
+    end
+    # use a special kernel ? what if more than 3 dims ?
+    for sl in eachslice(src,dims=dims)
+        sl .-= selectdim(m,dims,1)
+    end
+    res = mapreduce(x->x*x,+,src,backend;
+        init=zero(eltype(src)),
+        dims=dims,
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        prefer_threads=prefer_threads,
+        block_size=block_size,
+        temp=temp,
+        switch_below=switch_below)
+    res ./= (size(src,dims) - ifelse(corrected , 1 , 0))
+    return res
+end
+
+"""
+    var(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected ::Bool = true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+ + The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar + (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is + required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination + array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, + except for the reduced dimension which becomes 1; there are some corner cases when one dimension is + zero, check against `Base.reduce` for CPU arrays for exact behavior. + + The `switch_below` parameter controls the threshold below which the reduction is performed on the + CPU and is only used for 1D reductions (i.e. `dims=nothing`). +""" +function var( + src::AbstractArray{T},backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + corrected ::Bool = true, + # CPU settings - ignored here + max_tasks::Int = Threads.nthreads(), + min_elems::Int = 1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int = 256, + temp::Union{Nothing, AbstractArray} = nothing, + switch_below::Int=0, +) where {T<:Real} + return var!(copy(src),backend; + dims=dims, + corrected = corrected, + # CPU settings - ignored here + max_tasks = max_tasks, + min_elems = min_elems, + prefer_threads = prefer_threads, + # GPU settings + block_size = block_size, + temp = temp, + switch_below = switch_below + ) +end \ No newline at end of file diff --git a/src/statistics/varm.jl b/src/statistics/varm.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/Project.toml b/test/Project.toml index d77e276..8ce8029 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..3fa0ba6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ import AcceleratedKernels as AK using KernelAbstractions +using Statistics using Test using Random import Pkg @@ -74,3 +75,4 @@ include("reduce.jl") include("accumulate.jl") include("predicates.jl") include("binarysearch.jl") +include("statistics.jl") \ No newline at end of file diff --git a/test/statistics.jl b/test/statistics.jl new file mode 100644 index 0000000..5bd5e74 --- /dev/null +++ b/test/statistics.jl @@ -0,0 +1,211 @@ +@testset "mean" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.mean + for dims in 1:3 + for isize in 1:3 + for jsize in 1:3 + for ksize in 1:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) + end + end + end + end + + # Fuzzy correctness testing + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(UInt32(1):UInt32(100), n1, 
n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Float32, n1, n2, n3) + v = array_from_host(vh) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:4 + n1 = rand(1:100) + n2 = rand(1:100) + n3 = rand(1:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + init = rand(1:100) + s = AK.mean(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ mean(vh; dims) + end + end + + # Test that undefined kwargs are not accepted + @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) + + # Testing different settings + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=2, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 1, 5)), + switch_below=50, + max_tasks=10, + min_elems=100, + ) + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=3, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 4, 1)), + switch_below=50, + max_tasks=16, + min_elems=1000, + ) +end + +@testset "var" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.var + for dims in 1:3 + for isize in 2:3 + for jsize in 2:3 + for ksize in 2:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.var(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ var(sh; dims) + @test eltype(dh) == eltype(var(sh; dims)) + tv = var(sh; dims) + d = AK.var!(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ tv + @test eltype(dh) == eltype(var(sh; dims)) + end + end + end + end + + # Fuzzy correctness testing + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(UInt32(1):UInt32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Float32, n1, n2, n3) + v = array_from_host(vh) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + + for _ in 1:100 + for dims in 1:3 + n1 = rand(2:100) + n2 = rand(2:100) + n3 = rand(2:100) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + init = rand(1:100) + s = AK.var(v; prefer_threads, dims) + sh = Array(s) + @test sh ≈ var(vh; dims) + end + end + + # Test that undefined kwargs are not accepted + @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) + + # Testing different settings + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=2, + corrected=true, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 1, 5)), + switch_below=50, + max_tasks=10, + min_elems=100, + ) + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=3, + corrected=false, + block_size=64, + temp=array_from_host(zeros(Float64, 3, 4, 1)), + switch_below=50, + max_tasks=16, + min_elems=1000, + ) +end \ No newline at end of file From 
b0a89546b894bb0b670200433666af797b15ddf9 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 10:45:15 +0200 Subject: [PATCH 02/23] make Float32 the type integers promote to --- src/statistics/mean.jl | 4 ++-- src/statistics/var.jl | 2 +- test/runtests.jl | 2 +- test/statistics.jl | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 97d03e3..2f6f108 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -48,7 +48,7 @@ function mean( temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, ) where {T<:Real} - init = T<:Integer ? zero(Float64) : zero(T) + init = T<:Integer ? zero(Float32) : zero(T) res = mapreduce(f,+,src,backend; init=init, dims=dims, @@ -74,7 +74,7 @@ function mean( temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, ) where {T<:Real} - init = T<:Integer ? zero(Float64) : zero(T) + init = T<:Integer ? zero(Float32) : zero(T) res = reduce(+,src,backend; init=init, dims=dims, diff --git a/src/statistics/var.jl b/src/statistics/var.jl index d9c10d0..7de563b 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -64,7 +64,7 @@ function var!( switch_below = switch_below ) if T<:Integer - src = Float64.(src) + src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? for sl in eachslice(src,dims=dims) diff --git a/test/runtests.jl b/test/runtests.jl index 3fa0ba6..6784f77 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,4 +75,4 @@ include("reduce.jl") include("accumulate.jl") include("predicates.jl") include("binarysearch.jl") -include("statistics.jl") \ No newline at end of file +include("statistics.jl") diff --git a/test/statistics.jl b/test/statistics.jl index 5bd5e74..2f44d04 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -11,11 +11,11 @@ d = AK.mean(s; prefer_threads, dims) dh = Array(d) @test dh ≈ mean(sh; dims) - @test eltype(dh) == eltype(mean(sh; dims)) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 d = AK.mean(s; prefer_threads, dims) dh = Array(d) @test dh ≈ mean(sh; dims) - @test eltype(dh) == eltype(mean(sh; dims)) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 end end end @@ -85,7 +85,7 @@ prefer_threads, dims=2, block_size=64, - temp=array_from_host(zeros(Float64, 3, 1, 5)), + temp=array_from_host(zeros(Float32, 3, 1, 5)), switch_below=50, max_tasks=10, min_elems=100, @@ -96,7 +96,7 @@ prefer_threads, dims=3, block_size=64, - temp=array_from_host(zeros(Float64, 3, 4, 1)), + temp=array_from_host(zeros(Float32, 3, 4, 1)), switch_below=50, max_tasks=16, min_elems=1000, @@ -116,12 +116,12 @@ end d = AK.var(s; prefer_threads, dims) dh = Array(d) @test dh ≈ var(sh; dims) - @test eltype(dh) == eltype(var(sh; dims)) + @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 tv = var(sh; dims) d = AK.var!(s; prefer_threads, dims) dh = Array(d) @test dh ≈ tv - @test eltype(dh) == eltype(var(sh; dims)) + @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 end end end @@ -192,7 +192,7 @@ end dims=2, corrected=true, block_size=64, - temp=array_from_host(zeros(Float64, 3, 1, 5)), + temp=array_from_host(zeros(Float32, 3, 1, 5)), switch_below=50, max_tasks=10, min_elems=100, @@ -203,7 +203,7 @@ end dims=3, corrected=false, block_size=64, - temp=array_from_host(zeros(Float64, 3, 4, 1)), + temp=array_from_host(zeros(Float32, 3, 4, 1)), switch_below=50, max_tasks=16, min_elems=1000, From 
1af77d64d678eea595c20549caf20976fe778056 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:16:21 +0200 Subject: [PATCH 03/23] use a kernel to substract the mean (perf boost and test to fix Metal) --- src/statistics/var.jl | 44 +++++++++++++++++++++++++++++++++++++++---- test/runtests.jl | 4 ++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 7de563b..39f36ff 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,3 +1,29 @@ +@kernel function slicing_mean1d!(src,m) + i = @index(Global, Linear) + src[i] -= m[i] +end +@kernel function slicing_mean2d!(src,m,dims) + @assert 1<=dims<=2 + i,j = @index(Global, NTuple) + if dims == 1 + src[i,j] -= m[i] + else + src[i,j] -= m[j] + end +end +@kernel function slicing_mean3d!(src,m,dims) + @assert 1<=dims<=3 + i,j,k = @index(Global, NTuple) + if dims == 1 + src[i,j,k] -= m[i] + elseif dims == 2 + src[i,j,k] -= m[j] + else + src[i,j,k] -= m[k] + end +end + + """ var!( src::AbstractArray{T}, backend::Backend=get_backend(src); @@ -38,7 +64,7 @@ CPU and is only used for 1D reductions (i.e. `dims=nothing`). """ function var!( - src::AbstractArray{T},backend::Backend=get_backend(src); + src::AbstractArray{T,N},backend::Backend=get_backend(src); dims::Union{Nothing, Int}=nothing, corrected ::Bool = true, # CPU settings - ignored here @@ -50,7 +76,7 @@ function var!( block_size::Int = 256, temp::Union{Nothing, AbstractArray} = nothing, switch_below::Int=0, -) where {T<:Real} +) where {T<:Real,N} m = mean( src,backend; dims=dims, @@ -67,8 +93,18 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if backend isa GPU && N<=3 + if N == 1 + slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + elseif N == 2 + slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) + elseif N == 3 + slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + end + else + for sl in eachslice(src,dims=dims) + sl .-= selectdim(m,dims,1) + end end res = mapreduce(x->x*x,+,src,backend; init=zero(eltype(src)), diff --git a/test/runtests.jl b/test/runtests.jl index 6784f77..5bb7f17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -67,6 +67,7 @@ end Aqua.test_all(AK) end +include("statistics.jl") include("partition.jl") include("looping.jl") include("map.jl") @@ -74,5 +75,4 @@ include("sort.jl") include("reduce.jl") include("accumulate.jl") include("predicates.jl") -include("binarysearch.jl") -include("statistics.jl") +include("binarysearch.jl") \ No newline at end of file From c897aa7de9cbb5b9758a7d85204e1faa97d8ccda Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:21:29 +0200 Subject: [PATCH 04/23] fix --- src/statistics/var.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 39f36ff..bc53459 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,25 +1,25 @@ @kernel function slicing_mean1d!(src,m) i = @index(Global, Linear) - src[i] -= m[i] + src[i] -= m end @kernel function slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 i,j = @index(Global, NTuple) if dims == 1 - src[i,j] -= m[i] + src[i,j] -= m[1,j] else - src[i,j] -= m[j] + src[i,j] -= m[i,1] end end @kernel function slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 i,j,k = @index(Global, NTuple) if dims == 1 - src[i,j,k] -= m[i] + src[i,j,k] -= m[1,j,k] elseif dims == 2 - src[i,j,k] -= m[j] 
+ src[i,j,k] -= m[i,1,k] else - src[i,j,k] -= m[k] + src[i,j,k] -= m[i,j,1] end end From f31b2329a2676869176b6382625a5538ec9eb02d Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:29:34 +0200 Subject: [PATCH 05/23] make gpu friendly backends --- src/statistics/var.jl | 52 +++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index bc53459..c2f0a2b 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,25 +1,43 @@ -@kernel function slicing_mean1d!(src,m) - i = @index(Global, Linear) - src[i] -= m +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean1d!(src,m) + N = @groupsize()[1] + iblock = @index(Group, Linear) + ithread = @index(Local, Linear) + i = ithread + (iblock - 0x1) * N + if i <= length(src) + src[i] -= m + end end -@kernel function slicing_mean2d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 - i,j = @index(Global, NTuple) - if dims == 1 - src[i,j] -= m[1,j] - else - src[i,j] -= m[i,1] + Nx,Ny = @groupsize() + iblock,jblock = @index(Group, NTuple) + ithread,jthread = @index(Local, NTuple) + i = ithread + (iblock - 0x1) * Nx + j = jthread + (jblock - 0x1) * Ny + if i<=size(src,1) && j<=size(src,2) + if dims == 1 + src[i,j] -= m[1,j] + else + src[i,j] -= m[i,1] + end end end -@kernel function slicing_mean3d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 - i,j,k = @index(Global, NTuple) - if dims == 1 - src[i,j,k] -= m[1,j,k] - elseif dims == 2 - src[i,j,k] -= m[i,1,k] - else - src[i,j,k] -= m[i,j,1] + Nx,Ny,Nz = @groupsize() + iblock,jblock,kblock = @index(Group, NTuple) + ithread,jthread,kthread = @index(Local, NTuple) + i = ithread + (iblock - 0x1) * Nx + j = jthread + (jblock - 0x1) * Ny + k = kthread + (kblock - 0x1) * Nz + if i<=size(src,1) && j<=size(src,2) && k<=size(src,3) + if dims == 1 + src[i,j,k] -= m[1,j,k] + elseif dims == 2 + src[i,j,k] -= m[i,1,k] + else + src[i,j,k] -= m[i,j,1] + end end end From e6428f26df02737c88af5b69ec440aad81dd43cd Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:30:53 +0200 Subject: [PATCH 06/23] use _ convention for kernels --- src/statistics/var.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index c2f0a2b..0146eee 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean1d!(src,m) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,m) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -7,7 +7,7 @@ src[i] -= m end end -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean2d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean2d!(src,m,dims) @assert 1<=dims<=2 Nx,Ny = @groupsize() iblock,jblock = @index(Group, NTuple) @@ -22,7 +22,7 @@ end end end end -@kernel inbounds=true cpu=false unsafe_indices=true function slicing_mean3d!(src,m,dims) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean3d!(src,m,dims) @assert 1<=dims<=3 Nx,Ny,Nz = @groupsize() iblock,jblock,kblock = @index(Group, NTuple) From fd17ece928380bdebdbe41e036878861d9606a1c Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:31:45 +0200 
Subject: [PATCH 07/23] f32 default on docs --- src/statistics/mean.jl | 2 +- src/statistics/var.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 2f6f108..6e3659d 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -16,7 +16,7 @@ Compute the mean of `src` along dimensions `dims` after applying `f`. If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 0146eee..ab629a5 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -61,7 +61,7 @@ end Compute the varience of `src` along dimensions `dims`. Can change `src` when using non-integer types If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional @@ -156,7 +156,7 @@ end Compute the varience of `src` along dimensions `dims`. If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float64` + dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` if it is an integer type. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional From 3e57905b0d21a1edf797fcb9160af57af04309bf Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:33:19 +0200 Subject: [PATCH 08/23] fix --- src/statistics/var.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ab629a5..cf43f1e 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -113,11 +113,11 @@ function var!( # use a special kernel ? what if more than 3 dims ? 
if backend isa GPU && N<=3 if N == 1 - slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) elseif N == 2 - slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) elseif N == 3 - slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) end else for sl in eachslice(src,dims=dims) From 072d55be5f9fb64918123313c34e2f1be1532f96 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 11:50:02 +0200 Subject: [PATCH 09/23] fix reducing multi-dim arrray to scalar --- src/statistics/mean.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl index 6e3659d..c004e95 100644 --- a/src/statistics/mean.jl +++ b/src/statistics/mean.jl @@ -58,7 +58,11 @@ function mean( block_size=block_size, temp=temp, switch_below=switch_below) - return res ./ size(src,dims) + if isnothing(dims) + return res./length(src) + else + return res./size(src,dims) + end end function mean( @@ -84,5 +88,9 @@ function mean( block_size=block_size, temp=temp, switch_below=switch_below) - return res./size(src,dims) + if isnothing(dims) + return res./length(src) + else + return res./size(src,dims) + end end \ No newline at end of file From f6977d6b26bae2f78845fb4d23a11e57570e43b1 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 12:23:39 +0200 Subject: [PATCH 10/23] add kenel for multi dim var >3 --- src/statistics/var.jl | 72 +++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index cf43f1e..e5007a4 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,m) +@kernel inbounds=true unsafe_indices=true function _slicing_mean1d!(src,m) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -7,41 +7,41 @@ src[i] -= m end end -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean2d!(src,m,dims) - @assert 1<=dims<=2 - Nx,Ny = @groupsize() - iblock,jblock = @index(Group, NTuple) - ithread,jthread = @index(Local, NTuple) - i = ithread + (iblock - 0x1) * Nx - j = jthread + (jblock - 0x1) * Ny - if i<=size(src,1) && j<=size(src,2) - if dims == 1 - src[i,j] -= m[1,j] - else - src[i,j] -= m[i,1] - end - end -end -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean3d!(src,m,dims) - @assert 1<=dims<=3 - Nx,Ny,Nz = @groupsize() - iblock,jblock,kblock = @index(Group, NTuple) - ithread,jthread,kthread = @index(Local, NTuple) - i = ithread + (iblock - 0x1) * Nx - j = jthread + (jblock - 0x1) * Ny - k = kthread + (kblock - 0x1) * Nz - if i<=size(src,1) && j<=size(src,2) && k<=size(src,3) - if dims == 1 - src[i,j,k] -= m[1,j,k] - elseif dims == 2 - src[i,j,k] -= m[i,1,k] - else - src[i,j,k] -= m[i,j,1] + +# N-D version: m is an array with size(m, dims) == 1 +@kernel inbounds=true unsafe_indices=true function _slicing_meannd!(src, m, dims) + @assert 1 <= dims <= ndims(src) + + # Shapes/strides + src_strides = strides(src) + m_strides = strides(m) + nd = length(src_strides) + + # Group/thread indices (zero-based, like your mapreduce kernel) + N = @groupsize()[1] + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 + tid = ithread + 
iblock * N + + if tid < length(src) + # Reconstruct multi-index of src from linear tid using strides, + # and build the matching linear index into m by ignoring the reduced dim. + tmp = tid + midx0 = typeof(tid)(0) # zero-based linear index into m + + KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 + idxi = tmp ÷ src_strides[i] # coordinate along dimension i (zero-based) + if i != dims + midx0 += idxi * m_strides[i] + end + tmp = tmp % src_strides[i] end + + # Subtract: src linear index is tid+1 (1-based), m uses midx0+1 with reduced axis fixed to 1 + src[tid + 0x1] -= m[midx0 + 0x1] end end - """ var!( src::AbstractArray{T}, backend::Backend=get_backend(src); @@ -111,13 +111,11 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? - if backend isa GPU && N<=3 + if backend isa GPU if N == 1 _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) - elseif N == 2 - _slicing_mean2d!(backend,block_size)(src,m,dims;ndrange=size(src)) - elseif N == 3 - _slicing_mean3d!(backend,block_size)(src,m,dims;ndrange=size(src)) + else + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) end else for sl in eachslice(src,dims=dims) From 59622e7a86af54c8e1971a6f67b43613dd54a7eb Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 12:44:50 +0200 Subject: [PATCH 11/23] fix reducing multi-dim arrray to scalar --- src/statistics/var.jl | 29 +++++++++++++++++------------ test/statistics.jl | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index e5007a4..9058820 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@kernel inbounds=true unsafe_indices=true function _slicing_mean1d!(src,m) +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m)) N = @groupsize()[1] iblock = @index(Group, Linear) ithread = @index(Local, Linear) @@ -9,9 +9,7 @@ end # N-D version: m is an array with size(m, dims) == 1 -@kernel inbounds=true unsafe_indices=true function _slicing_meannd!(src, m, dims) - @assert 1 <= dims <= ndims(src) - +@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) # Shapes/strides src_strides = strides(src) m_strides = strides(m) @@ -111,15 +109,19 @@ function var!( src = Float32.(src) end # use a special kernel ? what if more than 3 dims ? 
- if backend isa GPU - if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) - else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) - end + if isnothing(dims) + src .-= m else - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if backend isa GPU + if N == 1 + _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + else + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) + end + else + for sl in eachslice(src,dims=dims) + sl .-= selectdim(m,dims,1) + end end end res = mapreduce(x->x*x,+,src,backend; @@ -131,6 +133,9 @@ function var!( block_size=block_size, temp=temp, switch_below=switch_below) + if isnothing(dims) + return res ./ (length(src) - ifelse(corrected , 1 , 0)) + end res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) return res end diff --git a/test/statistics.jl b/test/statistics.jl index 2f44d04..f0d9ccd 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -79,6 +79,17 @@ @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) # Testing different settings + AK.mean( + x->x , + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=nothing, + block_size=64, + temp=nothing, + switch_below=50, + max_tasks=10, + min_elems=100, + ) AK.mean( x->x , array_from_host(rand(Int32, 3, 4, 5)); @@ -186,6 +197,17 @@ end @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads,bad=:kwarg) # Testing different settings + AK.var( + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, + dims=nothing, + corrected=true, + block_size=64, + temp=nothing, + switch_below=50, + max_tasks=10, + min_elems=100, + ) AK.var( array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, From d7cc95d61c499e22820a45f28f7dcb550bd9b4f7 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:09:19 +0200 Subject: [PATCH 12/23] kernel like cpu --- src/statistics/var.jl | 44 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 9058820..e2bba19 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -8,34 +8,26 @@ end end -# N-D version: m is an array with size(m, dims) == 1 + @kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) - # Shapes/strides src_strides = strides(src) m_strides = strides(m) nd = length(src_strides) - - # Group/thread indices (zero-based, like your mapreduce kernel) N = @groupsize()[1] iblock = @index(Group, Linear) - 0x1 ithread = @index(Local, Linear) - 0x1 tid = ithread + iblock * N if tid < length(src) - # Reconstruct multi-index of src from linear tid using strides, - # and build the matching linear index into m by ignoring the reduced dim. tmp = tid - midx0 = typeof(tid)(0) # zero-based linear index into m - + midx0 = typeof(tid)(0) KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 - idxi = tmp ÷ src_strides[i] # coordinate along dimension i (zero-based) + idxi = tmp ÷ src_strides[i] if i != dims midx0 += idxi * m_strides[i] end tmp = tmp % src_strides[i] end - - # Subtract: src linear index is tid+1 (1-based), m uses midx0+1 with reduced axis fixed to 1 src[tid + 0x1] -= m[midx0 + 0x1] end end @@ -108,19 +100,38 @@ function var!( if T<:Integer src = Float32.(src) end - # use a special kernel ? what if more than 3 dims ? 
if isnothing(dims) src .-= m else - if backend isa GPU + if use_gpu_algorithm(backend, prefer_threads) if N == 1 _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) else _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) end else - for sl in eachslice(src,dims=dims) - sl .-= selectdim(m,dims,1) + if N ==1 + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i + src[i] -= m + end + else + src_strides = strides(src) + m_strides = strides(m) + nd = length(src_strides) + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc + @inbounds begin + tmp = isrc - 1 + midx0 = 0 + KernelAbstractions.Extras.@unroll for i in nd:-1:1 + idxi = tmp ÷ src_strides[i] + if i != dims + midx0 += idxi * m_strides[i] + end + tmp = tmp % src_strides[i] + end + src[isrc] -= m[midx0 + 1] + end + end end end end @@ -134,7 +145,8 @@ function var!( temp=temp, switch_below=switch_below) if isnothing(dims) - return res ./ (length(src) - ifelse(corrected , 1 , 0)) + res /= (length(src) - ifelse(corrected , 1 , 0)) + return res end res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) return res From 3d0148927aedd31fcd919cbe59edbb632a046fe7 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:28:46 +0200 Subject: [PATCH 13/23] tiny perf --- src/statistics/var.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index e2bba19..2aa249f 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -148,7 +148,8 @@ function var!( res /= (length(src) - ifelse(corrected , 1 , 0)) return res end - res ./= (size(src,dims) - ifelse(corrected , 1 , 0)) + s = 1/ (size(src,dims) - ifelse(corrected , 1 , 0)) + res .*= s return res end From b8ca7c6bdf1784ca2f789628283b7b6654c17b05 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 13:48:53 +0200 Subject: [PATCH 14/23] fix --- src/statistics/var.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 2aa249f..ef279c3 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -148,7 +148,7 @@ function var!( res /= (length(src) - ifelse(corrected , 1 , 0)) return res end - s = 1/ (size(src,dims) - ifelse(corrected , 1 , 0)) + s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0)) res .*= s return res end From 009cfa9def992bacc2be471e298b7a53c72d3e27 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 19:42:48 +0200 Subject: [PATCH 15/23] add benchmarks --- benchmark/Project.toml | 1 + benchmark/benchmarkresults.json | 1 + benchmark/statistics.jl | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 benchmark/benchmarkresults.json create mode 100644 benchmark/statistics.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml index a441514..d889c9e 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" diff --git a/benchmark/benchmarkresults.json b/benchmark/benchmarkresults.json new file mode 100644 index 0000000..355c227 --- /dev/null +++ b/benchmark/benchmarkresults.json @@ -0,0 +1 @@ 
+[{"Julia":"1.11.6","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"accumulate_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":395,"time":3.7371e6,"memory":13072,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.4264e6,"memory":2080,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.50375e6,"memory":2032,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":2.4822e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":466,"time":2.0876e6,"memory":15152,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.9522e6,"memory":2096,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.62405e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":1.4878e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":403,"time":1.4421e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":75,"time":2.8291e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":72,"time":1.87295e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=1":["TrialEstimate",{"allocs":69,"time":462000.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=2":["TrialEstimate",{"alloc
s":403,"time":1.60455e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=1":["TrialEstimate",{"allocs":72,"time":3.6004e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=2":["TrialEstimate",{"allocs":75,"time":6.25675e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":69,"time":442900.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"statistics":["BenchmarkGroup",{"data":{"var Int64":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":94,"time":1.76085e6,"memory":2720,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":762800.0,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":171,"time":1.6185e6,"memory":4464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":2.2675e6,"memory":15440,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var UInt32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":1.9149e6,"memory":2640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":1.726e6,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":172,"time":1.9587e6,"memory":4704,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":1.96035e6,"memory":15424,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean 
Int64":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.5188e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":78,"time":1.1905e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.2023e6,"memory":5376,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.00685e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean UInt32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.71e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":79,"time":1.727e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.9029e6,"memory":5360,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.5734e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Float32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":367950.0,"memory":1616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":77,"time":635100.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":204,"time":651850.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":286650.0,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var 
Float32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":573500.0,"memory":2624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":353,"time":439950.0,"memory":11744,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":169,"time":705800.0,"memory":4416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":494,"time":576900.0,"memory":13552,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"map":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.2987e6,"memory":2448,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.35005e6,"memory":2512,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.6264e6,"memory":2464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.61765e6,"memory":2528,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":80,"time":309600.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":91,"time":325700.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_2x":["TrialEstimate",{"allocs":91,"time":167500.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":80,"time":300500.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"accumulate_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":
1.4999e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.6487e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.813e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":723700.0,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":417400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sincos":["TrialEstimate",{"allocs":375,"time":489500.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sincos":["TrialEstimate",{"allocs":195,"time":358400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":455700.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.5352e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.67895e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.8073e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":1.68725e6,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":268700.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance"
:0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":419200.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sin":["TrialEstimate",{"allocs":204,"time":656300.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sin":["TrialEstimate",{"allocs":77,"time":545000.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"sort":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2205,"time":6.4508e6,"memory":70432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":674,"time":1.2854e6,"memory":18352,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":673,"time":3.6601e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base":["TrialEstimate",{"allocs":2204,"time":6.7894e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":2204,"time":7.76615e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":673,"time":1.59125e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"NTuple{5, 
Int64}":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2204,"time":2.15671e7,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":683,"time":4.801e6,"memory":18576,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":713850.0,"memory":1792,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":2.29805e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":1.34505e6,"memory":656,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":1.54965e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":2.0549e6,"memory":1808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":1.7811e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":2.38805e6,"memory":672,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":2.4792e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_dims=1_sin":["TrialEstimate",{"allocs":66,"time":447800.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2":["TrialEstimate",{"allocs":68,"time":261200.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":72,"time":1.762e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":
0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":27,"time":335300.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2_sin":["TrialEstimate",{"allocs":68,"time":272900.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1_sin":["TrialEstimate",{"allocs":27,"time":374600.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2_sin":["TrialEstimate",{"allocs":72,"time":2.5064e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":66,"time":307050.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}]},"tags":[]}]]] \ No newline at end of file diff --git a/benchmark/statistics.jl b/benchmark/statistics.jl new file mode 100644 index 0000000..2bcc8e2 --- /dev/null +++ b/benchmark/statistics.jl @@ -0,0 +1,25 @@ +group = addgroup!(SUITE, "statistics") + +n1d = 1_000_000 +n3d = 100 +for T in [UInt32, Int64, Float32] + local _group = addgroup!(group, "mean $T") + + local randrange = T == Float32 ? 
T : T(1):T(100) + + _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + + + _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + + local _group = addgroup!(group, "var $T") + _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, n1d))) + + + _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + _group["varnd_ak"] = @benchmarkable @sb(AK.var(v,dims=3)) setup=(v = ArrayType(rand(rng, $randrange, n3d,n3d,n3d))) + +end From 35b532216cd18082a36cb20864d221d7ac9cbb48 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 21:42:37 +0200 Subject: [PATCH 16/23] little perf improvment --- benchmark/benchmarkresults.json | 1 - benchmark/runbenchmarks.jl | 1 + src/statistics/var.jl | 58 +++++++++++++++++---------------- 3 files changed, 31 insertions(+), 29 deletions(-) delete mode 100644 benchmark/benchmarkresults.json diff --git a/benchmark/benchmarkresults.json b/benchmark/benchmarkresults.json deleted file mode 100644 index 355c227..0000000 --- a/benchmark/benchmarkresults.json +++ /dev/null @@ -1 +0,0 @@ -[{"Julia":"1.11.6","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"accumulate_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":395,"time":3.7371e6,"memory":13072,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.4264e6,"memory":2080,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.50375e6,"memory":2032,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":2.4822e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":466,"time":2.0876e6,"memory":15152,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":64,"time":6.9522e6,"memory":2096,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":70,"time":3.62405e6,"memo
ry":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":64,"time":1.4878e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":403,"time":1.4421e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":75,"time":2.8291e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":72,"time":1.87295e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=1":["TrialEstimate",{"allocs":69,"time":462000.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=2":["TrialEstimate",{"allocs":403,"time":1.60455e6,"memory":13200,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sincos_dims=1":["TrialEstimate",{"allocs":72,"time":3.6004e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_sincos_dims=2":["TrialEstimate",{"allocs":75,"time":6.25675e6,"memory":2256,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":69,"time":442900.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"statistics":["BenchmarkGroup",{"data":{"var 
Int64":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":94,"time":1.76085e6,"memory":2720,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":762800.0,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":171,"time":1.6185e6,"memory":4464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":2.2675e6,"memory":15440,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var UInt32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":1.9149e6,"memory":2640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":436,"time":1.726e6,"memory":14624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":172,"time":1.9587e6,"memory":4704,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":560,"time":1.96035e6,"memory":15424,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Int64":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.5188e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":78,"time":1.1905e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.2023e6,"memory":5376,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.00685e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean 
UInt32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":1.71e6,"memory":1632,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":79,"time":1.727e6,"memory":1952,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":188,"time":1.9029e6,"memory":5360,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":1.5734e6,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"mean Float32":["BenchmarkGroup",{"data":{"meannd_statistics":["TrialEstimate",{"allocs":49,"time":367950.0,"memory":1616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_statistics":["TrialEstimate",{"allocs":77,"time":635100.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"mean1d_ak":["TrialEstimate",{"allocs":204,"time":651850.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"meannd_ak":["TrialEstimate",{"allocs":151,"time":286650.0,"memory":5008,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"var 
Float32":["BenchmarkGroup",{"data":{"varnd_statistics":["TrialEstimate",{"allocs":92,"time":573500.0,"memory":2624,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"varnd_ak":["TrialEstimate",{"allocs":353,"time":439950.0,"memory":11744,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_statistics":["TrialEstimate",{"allocs":169,"time":705800.0,"memory":4416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"var1d_ak":["TrialEstimate",{"allocs":494,"time":576900.0,"memory":13552,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"map":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.2987e6,"memory":2448,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.35005e6,"memory":2512,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_2x":["TrialEstimate",{"allocs":92,"time":1.6264e6,"memory":2464,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":81,"time":1.61765e6,"memory":2528,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":80,"time":309600.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":91,"time":325700.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_2x":["TrialEstimate",{"allocs":91,"time":167500.0,"memory":2432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_2x":["TrialEstimate",{"allocs":80,"time":300500.0,"memory":2496,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"accumulate_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":
1.4999e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.6487e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.813e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":723700.0,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":417400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sincos":["TrialEstimate",{"allocs":375,"time":489500.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sincos":["TrialEstimate",{"allocs":195,"time":358400.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":455700.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_1d":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.5352e6,"memory":5040,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":367,"time":1.67895e6,"memory":11520,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":172,"time":1.8073e6,"memory":5120,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":438,"time":1.68725e6,"memory":13600,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_1d":["TrialEstimate",{"allocs":195,"time":268700.0,"memory":5408,"params":["Parameters",{"gctrial":true,"time_tolerance"
:0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d":["TrialEstimate",{"allocs":375,"time":419200.0,"memory":11648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_1d_sin":["TrialEstimate",{"allocs":204,"time":656300.0,"memory":5616,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_1d_sin":["TrialEstimate",{"allocs":77,"time":545000.0,"memory":1920,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"sort":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2205,"time":6.4508e6,"memory":70432,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":674,"time":1.2854e6,"memory":18352,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_sin":["TrialEstimate",{"allocs":673,"time":3.6601e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base":["TrialEstimate",{"allocs":2204,"time":6.7894e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_sin":["TrialEstimate",{"allocs":2204,"time":7.76615e6,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":673,"time":1.59125e6,"memory":18336,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"NTuple{5, 
Int64}":["BenchmarkGroup",{"data":{"base":["TrialEstimate",{"allocs":2204,"time":2.15671e7,"memory":70416,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck":["TrialEstimate",{"allocs":683,"time":4.801e6,"memory":18576,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}],"mapreduce_nd":["BenchmarkGroup",{"data":{"UInt32":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":713850.0,"memory":1792,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":2.29805e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":1.34505e6,"memory":656,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":1.54965e6,"memory":1888,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Int64":["BenchmarkGroup",{"data":{"base_dims=2":["TrialEstimate",{"allocs":69,"time":2.0549e6,"memory":1808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":61,"time":1.7811e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":28,"time":2.38805e6,"memory":672,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":61,"time":2.4792e6,"memory":1904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}],"Float32":["BenchmarkGroup",{"data":{"acck_dims=1_sin":["TrialEstimate",{"allocs":66,"time":447800.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2":["TrialEstimate",{"allocs":68,"time":261200.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2":["TrialEstimate",{"allocs":72,"time":1.762e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":
0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1":["TrialEstimate",{"allocs":27,"time":335300.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=2_sin":["TrialEstimate",{"allocs":68,"time":272900.0,"memory":1776,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"base_dims=1_sin":["TrialEstimate",{"allocs":27,"time":374600.0,"memory":640,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=2_sin":["TrialEstimate",{"allocs":72,"time":2.5064e6,"memory":2064,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}],"acck_dims=1":["TrialEstimate",{"allocs":66,"time":307050.0,"memory":1968,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"gctime":0.0}]},"tags":[]}]},"tags":[]}]},"tags":[]}]]] \ No newline at end of file diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl index 93736bb..dbc7d20 100644 --- a/benchmark/runbenchmarks.jl +++ b/benchmark/runbenchmarks.jl @@ -2,6 +2,7 @@ import AcceleratedKernels as AK using KernelAbstractions using GPUArrays +using Statistics using BenchmarkTools using BenchmarkPlots, StatsPlots, FileIO using StableRNGs diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ef279c3..ab6cd99 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -31,6 +31,23 @@ end src[tid + 0x1] -= m[midx0 + 0x1] end end +function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N} + src_strides ::NTuple{N,Int} = strides(src) + m_strides ::NTuple{N,Int} = strides(m) + nd = length(src_strides) + foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc + tmp = isrc - 1 + midx0 = 0 + KernelAbstractions.Extras.@unroll for i in nd:-1:1 + @inbounds idxi = tmp ÷ src_strides[i] + if i != dims + @inbounds midx0 += idxi * m_strides[i] + end + @inbounds tmp = tmp % src_strides[i] + end + @inbounds src[isrc] -= m[midx0 + 1] + end +end """ var!( @@ -105,9 +122,9 @@ function var!( else if use_gpu_algorithm(backend, prefer_threads) if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=size(src)) + _slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src)) else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=size(src)) + _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src)) end else if N ==1 @@ -115,36 +132,21 @@ function var!( src[i] -= m end else - src_strides = strides(src) - m_strides = strides(m) - nd = length(src_strides) - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc - @inbounds begin - tmp = isrc - 1 - midx0 = 0 - KernelAbstractions.Extras.@unroll for i in nd:-1:1 - idxi = tmp ÷ src_strides[i] - if i != dims - midx0 += idxi * m_strides[i] - end - tmp = tmp % src_strides[i] - end - src[isrc] -= m[midx0 + 1] - end - end + 
_slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems) end end end + ntmp = isnothing(dims) ? nothing : m res = mapreduce(x->x*x,+,src,backend; - init=zero(eltype(src)), - dims=dims, - max_tasks=max_tasks, - min_elems=min_elems, - prefer_threads=prefer_threads, - block_size=block_size, - temp=temp, - switch_below=switch_below) - if isnothing(dims) + init=zero(eltype(src)), + dims=dims, + max_tasks=max_tasks, + min_elems=min_elems, + prefer_threads=prefer_threads, + block_size=block_size, + temp=ntmp, + switch_below=switch_below) + if isnothing(dims) || N == 1 res /= (length(src) - ifelse(corrected , 1 , 0)) return res end From 038a86e5e5ab8073374efc7b033fa82c2bfdc321 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 23:33:56 +0200 Subject: [PATCH 17/23] rework var completly --- src/statistics/var.jl | 288 ++++++++++++------------------------------ test/statistics.jl | 9 +- 2 files changed, 83 insertions(+), 214 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index ab6cd99..1b1fbd5 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,223 +1,97 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_mean1d!(src,@Const(m)) - N = @groupsize()[1] - iblock = @index(Group, Linear) - ithread = @index(Local, Linear) - i = ithread + (iblock - 0x1) * N - if i <= length(src) - src[i] -= m - end -end - - -@kernel inbounds=true cpu=false unsafe_indices=true function _slicing_meannd!(src, @Const(m), @Const(dims)) - src_strides = strides(src) - m_strides = strides(m) - nd = length(src_strides) - N = @groupsize()[1] - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 - tid = ithread + iblock * N - - if tid < length(src) - tmp = tid - midx0 = typeof(tid)(0) - KernelAbstractions.Extras.@unroll for i in nd:-1i16:1i16 - idxi = tmp ÷ src_strides[i] - if i != dims - midx0 += idxi * m_strides[i] - end - tmp = tmp % src_strides[i] - end - src[tid + 0x1] -= m[midx0 + 0x1] - end -end -function _slicing_meannd_cpu!(src::AbstractArray{T,N}, m, dims::Int;max_tasks,min_elems) where {T<:Real, N} - src_strides ::NTuple{N,Int} = strides(src) - m_strides ::NTuple{N,Int} = strides(m) - nd = length(src_strides) - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do isrc - tmp = isrc - 1 - midx0 = 0 - KernelAbstractions.Extras.@unroll for i in nd:-1:1 - @inbounds idxi = tmp ÷ src_strides[i] - if i != dims - @inbounds midx0 += idxi * m_strides[i] - end - @inbounds tmp = tmp % src_strides[i] - end - @inbounds src[isrc] -= m[midx0 + 1] +@inline function chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} + nA, mA, M2A = a + nB, mB, M2B = b + if nA == 0 + return b + elseif nB == 0 + return a + else + nAB = nA + nB + δ = mB - mA + invn = inv(T(nAB)) + mean = (nA*mA + nB*mB) * invn + cross = (δ*δ) * (nA*nB * invn) + return (nAB, mean, M2A + M2B + cross) end end -""" - var!( - src::AbstractArray{T}, backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads::Bool=true, - - # GPU settings - block_size::Int=256, - temp::Union{Nothing, AbstractArray}=nothing, - switch_below::Int=0, - ) where {T<:Real} - - Compute the varience of `src` along dimensions `dims`. Can change `src` when using non-integer types - If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. 
The return type will be the same as the element type of `src` if it is a float type, or `Float32` - if it is an integer type. - ## CPU settings - Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional - arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other - cases are scaling linearly with the number of threads. - - ## GPU settings - The `block_size` parameter controls the number of threads per block. - - The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar - (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is - required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination - array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, - except for the reduced dimension which becomes 1; there are some corner cases when one dimension is - zero, check against `Base.reduce` for CPU arrays for exact behavior. - - The `switch_below` parameter controls the threshold below which the reduction is performed on the - CPU and is only used for 1D reductions (i.e. `dims=nothing`). -""" -function var!( - src::AbstractArray{T,N},backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - # CPU settings - ignored here - max_tasks::Int = Threads.nthreads(), - min_elems::Int = 1, +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, prefer_threads::Bool=true, - - # GPU settings - block_size::Int = 256, - temp::Union{Nothing, AbstractArray} = nothing, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored switch_below::Int=0, -) where {T<:Real,N} - m = mean( - src,backend; +) where {T<:Integer,N} + + init = (0, 0f0, 0f0) + mapper = x -> (1, Float32(x), 0f0) + + stats = mapreduce( + mapper, chan_merge, src, backend; + init=init, neutral=init, dims=dims, - # CPU settings - ignored here - max_tasks = max_tasks, - min_elems = min_elems, - prefer_threads = prefer_threads, - # GPU settings - block_size = block_size, - temp = temp, - switch_below = switch_below + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, ) - if T<:Integer - src = Float32.(src) - end - if isnothing(dims) - src .-= m + + if dims === nothing + n, _, M2 = stats + return M2 / Float32(n - ifelse(corrected , 1 , 0)) else - if use_gpu_algorithm(backend, prefer_threads) - if N == 1 - _slicing_mean1d!(backend,block_size)(src,m;ndrange=length(src)) - else - _slicing_meannd!(backend,block_size)(src,m,dims;ndrange=length(src)) - end - else - if N ==1 - foreachindex(src; max_tasks=max_tasks, min_elems=min_elems) do i - src[i] -= m - end - else - _slicing_meannd_cpu!(src,m,dims;max_tasks=max_tasks,min_elems=min_elems) - end - end - end - ntmp = isnothing(dims) ? 
nothing : m - res = mapreduce(x->x*x,+,src,backend; - init=zero(eltype(src)), - dims=dims, - max_tasks=max_tasks, - min_elems=min_elems, - prefer_threads=prefer_threads, - block_size=block_size, - temp=ntmp, - switch_below=switch_below) - if isnothing(dims) || N == 1 - res /= (length(src) - ifelse(corrected , 1 , 0)) - return res + # stats is an array of (Int32, Float32, Float32); finalize into Float32 array + out = similar(stats, Float32) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out end - s = eltype(src)(1)/ (size(src,dims) - ifelse(corrected , 1 , 0)) - res .*= s - return res end -""" - var!( - src::AbstractArray{T}, backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads=prefer_threads, - - # GPU settings - block_size::Int=256, - temp::Union{Nothing, AbstractArray}=nothing, - switch_below::Int=0, - ) where {T<:Real} - - Compute the varience of `src` along dimensions `dims`. - If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that - dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32` - if it is an integer type. - ## CPU settings - Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional - arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other - cases are scaling linearly with the number of threads. - - ## GPU settings - The `block_size` parameter controls the number of threads per block. - - The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar - (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is - required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination - array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, - except for the reduced dimension which becomes 1; there are some corner cases when one dimension is - zero, check against `Base.reduce` for CPU arrays for exact behavior. - The `switch_below` parameter controls the threshold below which the reduction is performed on the - CPU and is only used for 1D reductions (i.e. `dims=nothing`). 
-""" function var( - src::AbstractArray{T},backend::Backend=get_backend(src); - dims::Union{Nothing, Int}=nothing, - corrected ::Bool = true, - # CPU settings - ignored here - max_tasks::Int = Threads.nthreads(), - min_elems::Int = 1, + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, prefer_threads::Bool=true, - - # GPU settings - block_size::Int = 256, - temp::Union{Nothing, AbstractArray} = nothing, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored switch_below::Int=0, -) where {T<:Real} - return var!(copy(src),backend; +) where {T<:AbstractFloat,N} + + init = (0, zero(T), zero(T)) + mapper = x -> (1, x, zero(typeof(x))) + + stats = mapreduce( + mapper, chan_merge, src, backend; + init=init, neutral=init, dims=dims, - corrected = corrected, - # CPU settings - ignored here - max_tasks = max_tasks, - min_elems = min_elems, - prefer_threads = prefer_threads, - # GPU settings - block_size = block_size, - temp = temp, - switch_below = switch_below - ) -end \ No newline at end of file + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / T(n - ifelse(corrected , 1 , 0)) + else + # stats is an array of (Int32, Float32, Float32); finalize into Float32 array + out = similar(stats, T) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end diff --git a/test/statistics.jl b/test/statistics.jl index f0d9ccd..5ab3741 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -128,11 +128,6 @@ end dh = Array(d) @test dh ≈ var(sh; dims) @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 - tv = var(sh; dims) - d = AK.var!(s; prefer_threads, dims) - dh = Array(d) - @test dh ≈ tv - @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32 end end end @@ -214,7 +209,7 @@ end dims=2, corrected=true, block_size=64, - temp=array_from_host(zeros(Float32, 3, 1, 5)), + temp=nothing, switch_below=50, max_tasks=10, min_elems=100, @@ -225,7 +220,7 @@ end dims=3, corrected=false, block_size=64, - temp=array_from_host(zeros(Float32, 3, 4, 1)), + temp = nothing, switch_below=50, max_tasks=16, min_elems=1000, From bb68e5199e19fa5bcda4c5a4fc133150ace056a2 Mon Sep 17 00:00:00 2001 From: yolhan83 Date: Tue, 19 Aug 2025 23:51:36 +0200 Subject: [PATCH 18/23] add doc back --- src/statistics/var.jl | 46 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/statistics/var.jl b/src/statistics/var.jl index 1b1fbd5..f6d3a66 100644 --- a/src/statistics/var.jl +++ b/src/statistics/var.jl @@ -1,4 +1,4 @@ -@inline function chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} +@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real} nA, mA, M2A = a nB, mB, M2B = b if nA == 0 @@ -14,7 +14,45 @@ return (nAB, mean, M2A + M2B + cross) end end +""" + var( + src::AbstractArray{T}, backend::Backend=get_backend(src); + dims::Union{Nothing, Int}=nothing, + corrected ::Bool = true, + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + temp::Union{Nothing, 
AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+    Compute the variance of `src` along dimensions `dims`.
+    If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
+    dimension. The return type will be the same as the element type of `src` if it is a float type, or `Float32`
+    if it is an integer type.
+    ## CPU settings
+    Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+    arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
+    cases are scaling linearly with the number of threads.
+
+    ## GPU settings
+    The `block_size` parameter controls the number of threads per block.
+
+    The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
+    (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
+    required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
+    array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
+    except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
+    zero, check against `Base.reduce` for CPU arrays for exact behavior.
+
+    The `switch_below` parameter controls the threshold below which the reduction is performed on the
+    CPU and is only used for 1D reductions (i.e. `dims=nothing`).
+"""
 function var(
     src::AbstractArray{T,N}, backend::Backend=get_backend(src);
     dims::Union{Nothing,Int}=nothing,
@@ -31,7 +69,7 @@ function var(
     mapper = x -> (1, Float32(x), 0f0)
 
     stats = mapreduce(
-        mapper, chan_merge, src, backend;
+        mapper, _chan_merge, src, backend;
         init=init, neutral=init,
         dims=dims,
         max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
@@ -44,7 +82,6 @@ function var(
         n, _, M2 = stats
         return M2 / Float32(n - ifelse(corrected , 1 , 0))
     else
-        # stats is an array of (Int32, Float32, Float32); finalize into Float32 array
         out = similar(stats, Float32)
         AcceleratedKernels.map!(
             s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))),
@@ -72,7 +109,7 @@ function var(
     mapper = x -> (1, x, zero(typeof(x)))
 
     stats = mapreduce(
-        mapper, chan_merge, src, backend;
+        mapper, _chan_merge, src, backend;
         init=init, neutral=init,
         dims=dims,
         max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads,
@@ -85,7 +122,6 @@ function var(
         n, _, M2 = stats
         return M2 / T(n - ifelse(corrected , 1 , 0))
     else
-        # stats is an array of (Int32, Float32, Float32); finalize into Float32 array
         out = similar(stats, T)
         AcceleratedKernels.map!(
             s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))),

From face574d73c8c444c82dff690f9c348bac0069fc Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Tue, 19 Aug 2025 23:56:15 +0200
Subject: [PATCH 19/23] avoid rm new line

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5bb7f17..af19fcc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -75,4 +75,4 @@ include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")
 include("predicates.jl")
-include("binarysearch.jl")
\ No newline at end of file
+include("binarysearch.jl")

From f93a70873536dd7c3c5380c5fd967683b7fd9f45 Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 00:03:29 +0200
Subject: [PATCH 20/23] another new line

---
 src/AcceleratedKernels.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index ce48e1f..37a30ea 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -36,4 +36,5 @@ include("predicates.jl")
 include("arithmetics.jl")
 include("statistics/statistics.jl")
+
 
 end # module AcceleratedKernels

From 2e0967d09363431a3bd79e9b2a39aa2325a631cd Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:14:41 +0200
Subject: [PATCH 21/23] other lines

---
 src/statistics/mean.jl   | 2 +-
 src/statistics/median.jl | 2 +-
 src/statistics/middle.jl | 2 +-
 test/statistics.jl       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl
index c004e95..cae47c9 100644
--- a/src/statistics/mean.jl
+++ b/src/statistics/mean.jl
@@ -93,4 +93,4 @@ function mean(
     else
         return res./size(src,dims)
     end
-end
\ No newline at end of file
+end
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
index 5dd411f..b3a65aa 100644
--- a/src/statistics/median.jl
+++ b/src/statistics/median.jl
@@ -1 +1 @@
-# need to do the middle kernel first
\ No newline at end of file
+# need to do the middle kernel first
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
index 5486841..38bee94 100644
--- a/src/statistics/middle.jl
+++ b/src/statistics/middle.jl
@@ -1 +1 @@
-# need to sort multi dimensional arrays first
\ No newline at end of file
+# need to sort multi dimensional arrays first
diff --git a/test/statistics.jl b/test/statistics.jl
index 5ab3741..7bccd91 100644
--- a/test/statistics.jl
+++ b/test/statistics.jl
@@ -225,4 +225,4 @@ end
         max_tasks=16,
         min_elems=1000,
     )
-end
\ No newline at end of file
+end

From bc7869f62a0de4736db31d7ea2792d4cc3c0b69d Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:15:55 +0200
Subject: [PATCH 22/23] other lines

---
 src/statistics/statistics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
index 46d413d..48c4e31 100644
--- a/src/statistics/statistics.jl
+++ b/src/statistics/statistics.jl
@@ -7,4 +7,4 @@ include("quantile.jl")
 include("std.jl")
 include("stdm.jl")
 include("var.jl")
-include("varm.jl")
\ No newline at end of file
+include("varm.jl")

From 1b77f18ca9452847552035355c4f2a1380273af8 Mon Sep 17 00:00:00 2001
From: yolhan83
Date: Wed, 20 Aug 2025 09:17:13 +0200
Subject: [PATCH 23/23] only keep mean and var for now

---
 src/statistics/cor.jl        | 0
 src/statistics/cov.jl        | 0
 src/statistics/median.jl     | 1 -
 src/statistics/middle.jl     | 1 -
 src/statistics/quantile.jl   | 0
 src/statistics/statistics.jl | 8 --------
 src/statistics/std.jl        | 0
 src/statistics/stdm.jl       | 0
 src/statistics/varm.jl       | 0
 9 files changed, 10 deletions(-)
 delete mode 100644 src/statistics/cor.jl
 delete mode 100644 src/statistics/cov.jl
 delete mode 100644 src/statistics/median.jl
 delete mode 100644 src/statistics/middle.jl
 delete mode 100644 src/statistics/quantile.jl
 delete mode 100644 src/statistics/std.jl
 delete mode 100644 src/statistics/stdm.jl
 delete mode 100644 src/statistics/varm.jl

diff --git a/src/statistics/cor.jl b/src/statistics/cor.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/cov.jl b/src/statistics/cov.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/median.jl b/src/statistics/median.jl
deleted file mode 100644
index b3a65aa..0000000
--- a/src/statistics/median.jl
+++ /dev/null
@@ -1 +0,0 @@
-# need to do the middle kernel first
diff --git a/src/statistics/middle.jl b/src/statistics/middle.jl
deleted file mode 100644
index 38bee94..0000000
--- a/src/statistics/middle.jl
+++ /dev/null
@@ -1 +0,0 @@
-# need to sort multi dimensional arrays first
diff --git a/src/statistics/quantile.jl b/src/statistics/quantile.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
index 48c4e31..6f78f1b 100644
--- a/src/statistics/statistics.jl
+++ b/src/statistics/statistics.jl
@@ -1,10 +1,2 @@
-include("median.jl")
 include("mean.jl")
-include("cor.jl")
-include("cov.jl")
-include("middle.jl")
-include("quantile.jl")
-include("std.jl")
-include("stdm.jl")
 include("var.jl")
-include("varm.jl")
diff --git a/src/statistics/std.jl b/src/statistics/std.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/stdm.jl b/src/statistics/stdm.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/statistics/varm.jl b/src/statistics/varm.jl
deleted file mode 100644
index e69de29..0000000
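
On the algorithm itself: the (n, mean, M2) triples combined by `_chan_merge` (introduced in PATCH 17/23, renamed in PATCH 18/23) follow the well-known pairwise update of Chan, Golub and LeVeque, which is what lets `var` be written as a single `mapreduce`. Below is a standalone sketch of that combination rule in plain Julia; the helper names (`merge_stats`, `chunk_stats`) and the random test data are illustrative, not part of the patches, but the arithmetic mirrors `_chan_merge` in src/statistics/var.jl.

    using Statistics

    # Combine two partial summaries (count, mean, sum of squared deviations),
    # using the same formula as `_chan_merge` in the patch.
    function merge_stats(a, b)
        nA, mA, M2A = a
        nB, mB, M2B = b
        nA == 0 && return b
        nB == 0 && return a
        n = nA + nB
        δ = mB - mA
        m = (nA * mA + nB * mB) / n
        return (n, m, M2A + M2B + δ^2 * (nA * nB / n))
    end

    # Summary of a single chunk, computed directly.
    chunk_stats(x) = (length(x), mean(x), sum(abs2, x .- mean(x)))

    x, y = randn(1_000), randn(500)
    n, m, M2 = merge_stats(chunk_stats(x), chunk_stats(y))

    # Merging per-chunk summaries reproduces the statistics of the whole data:
    @assert m ≈ mean(vcat(x, y))
    @assert M2 / (n - 1) ≈ var(vcat(x, y))   # corrected sample variance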
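
For readers trying the series end-to-end, the tests and benchmarks above call the new entry points as `AK.mean(v)`, `AK.mean(v, dims=3)`, `AK.var(v)` and `AK.var(v, dims=3)`. The snippet below is only a usage sketch under those assumptions (the arrays and the comparisons against Statistics are not part of the patches); it uses the documented `dims` and `corrected` keywords and leaves the CPU/GPU tuning keywords at their defaults.

    import AcceleratedKernels as AK
    using Statistics  # reference implementation for comparison

    v = rand(Float32, 1_000_000)       # 1D, as in the benchmarks
    a = rand(Float32, 100, 100, 100)   # 3D, reduced along dims=3 below

    @assert AK.mean(v) ≈ Statistics.mean(v)
    @assert AK.var(v)  ≈ Statistics.var(v)

    # Along a dimension the reduced axis keeps size 1 (100x100x1 here):
    @assert AK.mean(a, dims=3) ≈ Statistics.mean(a, dims=3)
    @assert AK.var(a, dims=3, corrected=false) ≈ Statistics.var(a, dims=3, corrected=false)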