diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index a441514..d889c9e 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -7,4 +7,5 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+Statistics = "10745b16-79ce-11e8-05f9-91d7b0d4bd36"
 StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
index 93736bb..dbc7d20 100644
--- a/benchmark/runbenchmarks.jl
+++ b/benchmark/runbenchmarks.jl
@@ -2,6 +2,7 @@
 import AcceleratedKernels as AK
 using KernelAbstractions
 using GPUArrays
+using Statistics
 using BenchmarkTools
 using BenchmarkPlots, StatsPlots, FileIO
 using StableRNGs
diff --git a/benchmark/statistics.jl b/benchmark/statistics.jl
new file mode 100644
index 0000000..2bcc8e2
--- /dev/null
+++ b/benchmark/statistics.jl
@@ -0,0 +1,26 @@
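+# Benchmarks comparing AK.mean / AK.var against Statistics.mean / Statistics.var.
+# Assumes the harness in runbenchmarks.jl provides the `SUITE` benchmark group and
+# the `rng`, `ArrayType` globals and `@sb` macro used below.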
+group = addgroup!(SUITE, "statistics")
+
+n1d = 1_000_000
+n3d = 100
+for T in [UInt32, Int64, Float32]
+    local _group = addgroup!(group, "mean $T")
+
+    # Floats are sampled over the whole type, integers over a small range
+    local randrange = T == Float32 ? T : (T(1):T(100))
+
+    _group["mean1d_statistics"] = @benchmarkable @sb(Statistics.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+    _group["mean1d_ak"] = @benchmarkable @sb(AK.mean(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+
+    _group["meannd_statistics"] = @benchmarkable @sb(Statistics.mean(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+    _group["meannd_ak"] = @benchmarkable @sb(AK.mean(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+
+    local _group = addgroup!(group, "var $T")
+    _group["var1d_statistics"] = @benchmarkable @sb(Statistics.var(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+    _group["var1d_ak"] = @benchmarkable @sb(AK.var(v)) setup=(v = ArrayType(rand(rng, $randrange, $n1d)))
+
+    _group["varnd_statistics"] = @benchmarkable @sb(Statistics.var(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+    _group["varnd_ak"] = @benchmarkable @sb(AK.var(v; dims=3)) setup=(v = ArrayType(rand(rng, $randrange, $n3d, $n3d, $n3d)))
+end
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d662c2a..37a30ea 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -34,6 +34,7 @@ include("accumulate/accumulate.jl")
 include("searchsorted.jl")
 include("predicates.jl")
 include("arithmetics.jl")
+include("statistics/statistics.jl")
 
 end # module AcceleratedKernels
diff --git a/src/statistics/mean.jl b/src/statistics/mean.jl
new file mode 100644
index 0000000..cae47c9
--- /dev/null
+++ b/src/statistics/mean.jl
@@ -0,0 +1,115 @@
+"""
+    mean(
+        [f,] src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+Compute the mean of `src` along dimensions `dims`, after applying `f` (if given) to each element.
+If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along
+that dimension. The return type is the element type of `src` for floating-point types, or
+`Float32` for integer types.
+
+## CPU settings
+Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all
+other cases scale linearly with the number of threads.
+
+## GPU settings
+The `block_size` parameter controls the number of threads per block.
+
+The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a
+scalar (`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)`
+is required. For reduction along a dimension (`dims` is an integer), `temp` is used as the
+destination array and thus must have the exact dimensions required - i.e. the same dimensionwise
+sizes as `src`, except for the reduced dimension, which becomes 1; there are some corner cases
+when one dimension is zero - check against `Base.reduce` for CPU arrays for the exact behaviour.
+
+The `switch_below` parameter controls the threshold below which the reduction is performed on
+the CPU; it is only used for 1D reductions (i.e. `dims=nothing`).
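+
+## Examples
+A minimal usage sketch (illustrative only - plain `Array`s shown, but any array type with a
+KernelAbstractions backend works the same):
+```julia
+import AcceleratedKernels as AK
+
+v = rand(Int32(1):Int32(100), 1_000)
+AK.mean(v)          # Float32 scalar
+AK.mean(abs2, v)    # mean of squared elements, Float32 scalar
+
+A = rand(Float32, 10, 10, 10)
+AK.mean(A; dims=3)  # 10×10×1 Float32 array
+```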
+"""
+function mean(
+    f::Function, src::AbstractArray{T}, backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractArray}=nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    # Accumulate in Float32 for integer inputs so the mean is fractional
+    init = T <: Integer ? zero(Float32) : zero(T)
+    res = mapreduce(f, +, src, backend;
+                    init=init,
+                    dims=dims,
+                    max_tasks=max_tasks,
+                    min_elems=min_elems,
+                    prefer_threads=prefer_threads,
+                    block_size=block_size,
+                    temp=temp,
+                    switch_below=switch_below)
+    if isnothing(dims)
+        return res / length(src)
+    else
+        return res ./ size(src, dims)
+    end
+end
+
+function mean(
+    src::AbstractArray{T}, backend::Backend=get_backend(src);
+    dims::Union{Nothing, Int}=nothing,
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractArray}=nothing,
+    switch_below::Int=0,
+) where {T<:Real}
+    init = T <: Integer ? zero(Float32) : zero(T)
+    res = reduce(+, src, backend;
+                 init=init,
+                 dims=dims,
+                 max_tasks=max_tasks,
+                 min_elems=min_elems,
+                 prefer_threads=prefer_threads,
+                 block_size=block_size,
+                 temp=temp,
+                 switch_below=switch_below)
+    if isnothing(dims)
+        return res / length(src)
+    else
+        return res ./ size(src, dims)
+    end
+end
diff --git a/src/statistics/statistics.jl b/src/statistics/statistics.jl
new file mode 100644
index 0000000..6f78f1b
--- /dev/null
+++ b/src/statistics/statistics.jl
@@ -0,0 +1,2 @@
+include("mean.jl")
+include("var.jl")
diff --git a/src/statistics/var.jl b/src/statistics/var.jl
new file mode 100644
index 0000000..f6d3a66
--- /dev/null
+++ b/src/statistics/var.jl
@@ -0,0 +1,152 @@
+# Merge two `(count, mean, M2)` accumulators with Chan et al.'s parallel
+# update; `M2` is the running sum of squared deviations from the mean.
+@inline function _chan_merge(a::Tuple{Int64,T,T}, b::Tuple{Int64,T,T}) where {T<:Real}
+    nA, mA, M2A = a
+    nB, mB, M2B = b
+    if nA == 0
+        return b
+    elseif nB == 0
+        return a
+    else
+        nAB = nA + nB
+        δ = mB - mA
+        invn = inv(T(nAB))
+        mAB = (nA * mA + nB * mB) * invn
+        cross = (δ * δ) * (nA * nB * invn)
+        return (nAB, mAB, M2A + M2B + cross)
+    end
+end
+
+"""
+    var(
+        src::AbstractArray{T}, backend::Backend=get_backend(src);
+        dims::Union{Nothing, Int}=nothing,
+        corrected::Bool=true,
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+        temp::Union{Nothing, AbstractArray}=nothing,
+        switch_below::Int=0,
+    ) where {T<:Real}
+
+Compute the variance of `src` along dimensions `dims`.
+If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along
+that dimension. If `corrected` is `true` (the default), the sum of squared deviations is scaled
+by `n - 1` rather than `n`. The return type is the element type of `src` for floating-point
+types, or `Float32` for integer types.
+
+## CPU settings
+Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
+arrays (`dims::Int`), multithreading currently only becomes faster for `max_tasks >= 4`; all
+other cases scale linearly with the number of threads.
+
+## GPU settings
+The `block_size` parameter controls the number of threads per block.
+
+The `temp` parameter is accepted for API compatibility but is currently ignored - `var`
+allocates its own intermediate storage of `(count, mean, M2)` tuples.
+
+The `switch_below` parameter controls the threshold below which the reduction is performed on
+the CPU; it is only used for 1D reductions (i.e. `dims=nothing`).
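+
+## Examples
+A minimal usage sketch (illustrative only - plain `Array`s shown, but any array type with a
+KernelAbstractions backend works the same):
+```julia
+import AcceleratedKernels as AK
+
+v = rand(Float32, 1_000)
+AK.var(v)                   # corrected sample variance, Float32 scalar
+AK.var(v; corrected=false)  # population variance
+
+A = rand(Int32(1):Int32(100), 10, 10, 10)
+AK.var(A; dims=2)           # 10×1×10 Float32 array
+```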
+""" +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored + switch_below::Int=0, +) where {T<:Integer,N} + + init = (0, 0f0, 0f0) + mapper = x -> (1, Float32(x), 0f0) + + stats = mapreduce( + mapper, _chan_merge, src, backend; + init=init, neutral=init, + dims=dims, + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / Float32(n - ifelse(corrected , 1 , 0)) + else + out = similar(stats, Float32) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / Float32(s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end + + +function var( + src::AbstractArray{T,N}, backend::Backend=get_backend(src); + dims::Union{Nothing,Int}=nothing, + corrected::Bool=true, + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + temp::Union{Nothing,AbstractArray}=nothing, # ignored + switch_below::Int=0, +) where {T<:AbstractFloat,N} + + init = (0, zero(T), zero(T)) + mapper = x -> (1, x, zero(typeof(x))) + + stats = mapreduce( + mapper, _chan_merge, src, backend; + init=init, neutral=init, + dims=dims, + max_tasks=max_tasks, min_elems=min_elems, prefer_threads=prefer_threads, + block_size=block_size, + temp=nothing, + switch_below=switch_below, + ) + + if dims === nothing + n, _, M2 = stats + return M2 / T(n - ifelse(corrected , 1 , 0)) + else + out = similar(stats, T) + AcceleratedKernels.map!( + s -> @inbounds(s[3] / (s[1] - ifelse(corrected , 1 , 0))), + out, stats, backend; + max_tasks=max_tasks, min_elems=min_elems, block_size=block_size, + ) + return out + end +end diff --git a/test/Project.toml b/test/Project.toml index d77e276..8ce8029 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..af19fcc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ import AcceleratedKernels as AK using KernelAbstractions +using Statistics using Test using Random import Pkg @@ -66,6 +67,7 @@ end Aqua.test_all(AK) end +include("statistics.jl") include("partition.jl") include("looping.jl") include("map.jl") diff --git a/test/statistics.jl b/test/statistics.jl new file mode 100644 index 0000000..7bccd91 --- /dev/null +++ b/test/statistics.jl @@ -0,0 +1,228 @@ +@testset "mean" begin + Random.seed!(0) + + # Test all possible corner cases against Statistics.mean + for dims in 1:3 + for isize in 1:3 + for jsize in 1:3 + for ksize in 1:3 + sh = rand(Int32(1):Int32(100), isize, jsize, ksize) + s = array_from_host(sh) + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32 + d = AK.mean(s; prefer_threads, dims) + dh = Array(d) + @test dh ≈ mean(sh; dims) + @test eltype(dh) == eltype(mean(sh; dims)) 
diff --git a/test/Project.toml b/test/Project.toml
index d77e276..8ce8029 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -4,4 +4,5 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-05f9-91d7b0d4bd36"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index 716fd8e..af19fcc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,6 @@
 import AcceleratedKernels as AK
 using KernelAbstractions
+using Statistics
 using Test
 using Random
 import Pkg
@@ -66,6 +67,7 @@
     Aqua.test_all(AK)
 end
 
+include("statistics.jl")
 include("partition.jl")
 include("looping.jl")
 include("map.jl")
diff --git a/test/statistics.jl b/test/statistics.jl
new file mode 100644
index 0000000..7bccd91
--- /dev/null
+++ b/test/statistics.jl
@@ -0,0 +1,210 @@
+@testset "mean" begin
+    Random.seed!(0)
+
+    # Test corner cases with small sizes against Statistics.mean
+    for dims in 1:3
+        for isize in 1:3
+            for jsize in 1:3
+                for ksize in 1:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.mean(s; prefer_threads, dims)
+                    dh = Array(d)
+                    @test dh ≈ mean(sh; dims)
+                    @test eltype(dh) == eltype(mean(sh; dims)) || eltype(dh) == Float32
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    # Also exercise reductions past ndims (dims=4 on 3D arrays)
+    for _ in 1:100
+        for dims in 1:4
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mean(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ mean(vh; dims)
+        end
+    end
+
+    # Test that undefined kwargs are not accepted
+    @test_throws MethodError AK.mean(array_from_host(rand(Int32, 10, 10)); prefer_threads, bad=:kwarg)
+
+    # Testing different settings
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=nothing,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=2,
+        block_size=64,
+        temp=array_from_host(zeros(Float32, 3, 1, 5)),
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.mean(
+        x -> x,
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=3,
+        block_size=64,
+        temp=array_from_host(zeros(Float32, 3, 4, 1)),
+        switch_below=50,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end
+
+@testset "var" begin
+    Random.seed!(0)
+
+    # Test corner cases with small sizes against Statistics.var; sizes start
+    # at 2 because the corrected variance of a single element is NaN
+    for dims in 1:3
+        for isize in 2:3
+            for jsize in 2:3
+                for ksize in 2:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.var(s; prefer_threads, dims)
+                    dh = Array(d)
+                    @test dh ≈ var(sh; dims)
+                    @test eltype(dh) == eltype(var(sh; dims)) || eltype(dh) == Float32
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(2:100)
+            n2 = rand(2:100)
+            n3 = rand(2:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.var(v; prefer_threads, dims)
+            sh = Array(s)
+            @test sh ≈ var(vh; dims)
+        end
+    end
+
+    # Test that undefined kwargs are not accepted
+    @test_throws MethodError AK.var(array_from_host(rand(Int32, 10, 10)); prefer_threads, bad=:kwarg)
+
+    # Testing different settings
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=nothing,
+        corrected=true,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=2,
+        corrected=true,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.var(
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
+        dims=3,
+        corrected=false,
+        block_size=64,
+        temp=nothing,
+        switch_below=50,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end