From f2f83cff5eda14f8afa4bd86f5e98ed726013086 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 15 Jan 2022 12:40:42 -0800 Subject: [PATCH 01/66] Add weighted `sem` --- src/scalarstats.jl | 38 ++++++++++++++++++++++++++++++++++---- test/scalarstats.jl | 4 ++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 924d681cf..ef0c2f877 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -261,10 +261,17 @@ realXcY(x::Real, y::Real) = x*y realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) """ - sem(x) + sem(x[, weights::AbstractWeights]; mean=nothing) -Return the standard error of the mean of collection `x`, -i.e. `sqrt(var(x, corrected=true) / length(x))`. +Return the standard error of the mean for a collection `x`. When using no weights, this is +equal to the (sample) standard deviation divided by the sample size. If weights are used, +the variance of the sample mean is calculated as follows: + +* `AnalyticWeights`: Not implemented. +* `FrequencyWeights`: ``\\frac{\\sum_{i=1}^n w_i (x_i - \\bar{x_i})^2}{(\\sum w_i) (\\sum w_i - 1)}`` +* `ProbabilityWeights`: ``\\frac{n}{n-1} \\frac{\\sum_{i=1}^n w_i^2 (x_i - \\bar{x_i})^2}{\\left( \\sum w_i \\right)^2}`` + +The standard error is then the square root of the above quantities. """ function sem(x) y = iterate(x) @@ -290,7 +297,30 @@ function sem(x) M = new_M end var = S / (count - 1) - return sqrt(var/count) + return sqrt(var / count) +end + +function sem(x, weights::UnitWeights; kwargs...) + if length(x) == length(weights) + T = promote(eltype(x), eltype(weights)) + return sem(Iterators.map(T, x); kwargs...) + else + throw(ArgumentError("data and weights must be the same length")) + end +end + +function sem(x, weights::FrequencyWeights) + return sqrt(var(x, weights; corrected=true) / weights.sum) +end + +function sem(x, weights::ProbabilityWeights) + n = length(x) + n == length(weights) || error("data and weights must be the same length") + μ = mean(x, weights) + var = sum(zip(x, weights)) do (x, w) + return (w * (x - μ))^2 + end + return sqrt(var * n / (n - 1)) / weights.sum end # Median absolute deviation diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 6b561a130..ea6e88d69 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -109,6 +109,10 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:5;]) ≈ 0.707106781186548 @test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 +@test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 +@test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.01 +@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.440215 rtol=.01 +@test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]) @test sem(Int[]) === NaN @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN @test_throws MethodError sem(Any[]) From 46be952b75b63e0eb4d267ef7d92576067a7a4f1 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 15 Jan 2022 12:42:33 -0800 Subject: [PATCH 02/66] add weighted sem method --- test/scalarstats.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index ea6e88d69..de7b865e8 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -110,8 +110,8 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:5;]) ≈ 0.707106781186548 @test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 -@test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.01 -@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.440215 rtol=.01 +@test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.001 +@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.440215 rtol=.001 @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]) @test sem(Int[]) === NaN @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN From 158fb35487c6b2522ae3a8d996aa7b730c90742a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 14:35:45 -0800 Subject: [PATCH 03/66] Correct missing method --- src/scalarstats.jl | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index ef0c2f877..7a706098f 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -260,6 +260,22 @@ variation(x) = ((m, s) = mean_and_std(x); s/m) realXcY(x::Real, y::Real) = x*y realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) + +""" + var_sample_mean(x[, weights::AbstractWeights]; mean=nothing) + +Return the estimated variance of the mean for a collection `x`. When using no weights, this is +equal to the (sample) standard deviation divided by the sample size. If weights are used, +the variance of the sample mean is calculated as follows: + +* `AnalyticWeights`: Not implemented. +* `FrequencyWeights`: ``\\frac{\\sum_{i=1}^n w_i (x_i - \\bar{x_i})^2}{(\\sum w_i) (\\sum w_i - 1)}`` +* `ProbabilityWeights`: ``\\frac{n}{n-1} \\frac{\\sum_{i=1}^n w_i^2 (x_i - \\bar{x_i})^2}{\\left( \\sum w_i \\right)^2}`` + +The standard error is then the square root of the above quantities. +""" + + """ sem(x[, weights::AbstractWeights]; mean=nothing) @@ -273,6 +289,8 @@ the variance of the sample mean is calculated as follows: The standard error is then the square root of the above quantities. """ +sem(x; mean::Number) = sqrt(varm(x, mean) / count) +sem(x; mean::Nothing) = sem(x) function sem(x) y = iterate(x) if y === nothing @@ -309,13 +327,10 @@ function sem(x, weights::UnitWeights; kwargs...) end end -function sem(x, weights::FrequencyWeights) - return sqrt(var(x, weights; corrected=true) / weights.sum) -end +# Weighted methods for the above +sem(x, weights::FrequencyWeights) = sqrt(var(x, weights; corrected=true) / weights.sum) function sem(x, weights::ProbabilityWeights) - n = length(x) - n == length(weights) || error("data and weights must be the same length") μ = mean(x, weights) var = sum(zip(x, weights)) do (x, w) return (w * (x - μ))^2 From 43c74193ad8589eaefbd6c251f03cf4e69074d86 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 14:46:52 -0800 Subject: [PATCH 04/66] bug fix --- src/scalarstats.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 7a706098f..c14b10843 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -289,8 +289,7 @@ the variance of the sample mean is calculated as follows: The standard error is then the square root of the above quantities. """ -sem(x; mean::Number) = sqrt(varm(x, mean) / count) -sem(x; mean::Nothing) = sem(x) +sem(x; mean::Number) = sqrt(var(x; mean = mean, corrected=true) / length(x)) function sem(x) y = iterate(x) if y === nothing From e770708ed5d3b401a2c5571fe584d6afa06a4146 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 14:56:55 -0800 Subject: [PATCH 05/66] Bug fix --- src/scalarstats.jl | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index c14b10843..02eaf047e 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -289,33 +289,7 @@ the variance of the sample mean is calculated as follows: The standard error is then the square root of the above quantities. """ -sem(x; mean::Number) = sqrt(var(x; mean = mean, corrected=true) / length(x)) -function sem(x) - y = iterate(x) - if y === nothing - T = eltype(x) - # Return the NaN of the type that we would get, had this collection - # contained any elements (this is consistent with std) - return oftype(sqrt((abs2(zero(T)) + abs2(zero(T)))/2), NaN) - end - count = 1 - value, state = y - y = iterate(x, state) - # Use Welford algorithm as seen in (among other places) - # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - M = value / 1 - S = real(zero(M)) - while y !== nothing - value, state = y - y = iterate(x, state) - count += 1 - new_M = M + (value - M) / count - S = S + realXcY(value - M, value - new_M) - M = new_M - end - var = S / (count - 1) - return sqrt(var / count) -end +sem(x; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) function sem(x, weights::UnitWeights; kwargs...) if length(x) == length(weights) From e5a03a8c6b5ee23a490574f356cda392a78c4305 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 15:07:55 -0800 Subject: [PATCH 06/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 02eaf047e..59309ee6e 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -291,12 +291,10 @@ The standard error is then the square root of the above quantities. """ sem(x; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) -function sem(x, weights::UnitWeights; kwargs...) - if length(x) == length(weights) - T = promote(eltype(x), eltype(weights)) - return sem(Iterators.map(T, x); kwargs...) - else - throw(ArgumentError("data and weights must be the same length")) +function sem(x, weights::UnitWeights; mean=nothing) + length(x) != length(weights) && throw(DimensionMismatch("inconsistent array dimension")) + return sem(x; mean=mean) +end end end From 82e4de26b741f6d776c5a334d01a4c47fe977154 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 15:11:52 -0800 Subject: [PATCH 07/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 59309ee6e..10a4af619 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -299,7 +299,9 @@ end end # Weighted methods for the above -sem(x, weights::FrequencyWeights) = sqrt(var(x, weights; corrected=true) / weights.sum) +function sem(x, weights::FrequencyWeights; mean=nothing) + return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) +end function sem(x, weights::ProbabilityWeights) μ = mean(x, weights) From 5eba9d4818181066f8c24ef03c7af476ad11e48c Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 15:16:20 -0800 Subject: [PATCH 08/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 10a4af619..22c415296 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -303,8 +303,7 @@ function sem(x, weights::FrequencyWeights; mean=nothing) return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) end -function sem(x, weights::ProbabilityWeights) - μ = mean(x, weights) +function sem(x, weights::ProbabilityWeights; mean=mean(x, weights)) var = sum(zip(x, weights)) do (x, w) return (w * (x - μ))^2 end From 2ef18ebdaac433e71993006848faa0cfd0cc5965 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 17 Jan 2022 19:10:24 -0800 Subject: [PATCH 09/66] =?UTF-8?q?change=20=CE=BC=20to=20mean?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/scalarstats.jl | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 22c415296..92514b573 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -261,21 +261,6 @@ realXcY(x::Real, y::Real) = x*y realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) -""" - var_sample_mean(x[, weights::AbstractWeights]; mean=nothing) - -Return the estimated variance of the mean for a collection `x`. When using no weights, this is -equal to the (sample) standard deviation divided by the sample size. If weights are used, -the variance of the sample mean is calculated as follows: - -* `AnalyticWeights`: Not implemented. -* `FrequencyWeights`: ``\\frac{\\sum_{i=1}^n w_i (x_i - \\bar{x_i})^2}{(\\sum w_i) (\\sum w_i - 1)}`` -* `ProbabilityWeights`: ``\\frac{n}{n-1} \\frac{\\sum_{i=1}^n w_i^2 (x_i - \\bar{x_i})^2}{\\left( \\sum w_i \\right)^2}`` - -The standard error is then the square root of the above quantities. -""" - - """ sem(x[, weights::AbstractWeights]; mean=nothing) @@ -295,8 +280,6 @@ function sem(x, weights::UnitWeights; mean=nothing) length(x) != length(weights) && throw(DimensionMismatch("inconsistent array dimension")) return sem(x; mean=mean) end - end -end # Weighted methods for the above function sem(x, weights::FrequencyWeights; mean=nothing) @@ -305,7 +288,7 @@ end function sem(x, weights::ProbabilityWeights; mean=mean(x, weights)) var = sum(zip(x, weights)) do (x, w) - return (w * (x - μ))^2 + return (w * (x - mean))^2 end return sqrt(var * n / (n - 1)) / weights.sum end From 130720072b6201fc3acd84f237107c9877068236 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Tue, 18 Jan 2022 09:05:20 -0800 Subject: [PATCH 10/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 92514b573..4d92e97f4 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -290,7 +290,7 @@ function sem(x, weights::ProbabilityWeights; mean=mean(x, weights)) var = sum(zip(x, weights)) do (x, w) return (w * (x - mean))^2 end - return sqrt(var * n / (n - 1)) / weights.sum + return sqrt(var * (n / (n - 1))) / sum(weights) end # Median absolute deviation From 7aafe38d99ae214a48b21b61869494fd8db39d1a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Tue, 18 Jan 2022 15:37:04 -0800 Subject: [PATCH 11/66] broadcast weights --- src/scalarstats.jl | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 4d92e97f4..e9ab8506a 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -262,7 +262,7 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) """ - sem(x[, weights::AbstractWeights]; mean=nothing) + sem(x[, weights::AbstractWeights]; [mean,]) Return the standard error of the mean for a collection `x`. When using no weights, this is equal to the (sample) standard deviation divided by the sample size. If weights are used, @@ -273,24 +273,36 @@ the variance of the sample mean is calculated as follows: * `ProbabilityWeights`: ``\\frac{n}{n-1} \\frac{\\sum_{i=1}^n w_i^2 (x_i - \\bar{x_i})^2}{\\left( \\sum w_i \\right)^2}`` The standard error is then the square root of the above quantities. + +# References + +Sribney, W. 1997. Probability weights, analytic weights, and summary statistics. Stata FAQ. """ sem(x; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) -function sem(x, weights::UnitWeights; mean=nothing) - length(x) != length(weights) && throw(DimensionMismatch("inconsistent array dimension")) +function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) + if length(x) ≠ length(weights) + throw(DimensionMismatch("array and weights do not have the same length")) + end return sem(x; mean=mean) end # Weighted methods for the above -function sem(x, weights::FrequencyWeights; mean=nothing) +function sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) end -function sem(x, weights::ProbabilityWeights; mean=mean(x, weights)) - var = sum(zip(x, weights)) do (x, w) - return (w * (x - mean))^2 +function sem(x::AbstractArray, weights::ProbabilityWeights; mean=mean(x, weights)) + n = length(x) + if n ≠ length(weights) + throw(DimensionMismatch("array and weights do not have the same length")) end - return sqrt(var * (n / (n - 1))) / sum(weights) + + # sum of squared errors = sse + sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w + return (w * (x_i - mean))^2 + end)) + return sqrt(sse * n / (n - 1)) / sum(weights) end # Median absolute deviation From aed6531cdc18a1771f0132004e02e9f1831c9f57 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Tue, 18 Jan 2022 18:07:38 -0800 Subject: [PATCH 12/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index e9ab8506a..44c7cd9af 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -280,7 +280,7 @@ Sribney, W. 1997. Probability weights, analytic weights, and summary statistics. """ sem(x; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) -function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) +function sem(x, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end From e49430c5b2c6c2fbe4d656b8f796ace1d87fd95f Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 19 Jan 2022 12:10:42 -0800 Subject: [PATCH 13/66] Work with arbitrary iterators, fuse loops --- src/moments.jl | 33 +++++++++++++++++++++++++++++++++ src/scalarstats.jl | 24 ++++++++++++------------ test/scalarstats.jl | 8 +++++++- 3 files changed, 52 insertions(+), 13 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 5b0a66147..5e9d4eccd 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -158,6 +158,39 @@ function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int; end +""" + n_mean_var(x; mean=nothing, corrected=true) -> (n, mean, var) + +Return the length, mean, and variance of `x`. +""" +function n_mean_var(x; mean=nothing, corrected=true) + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + n = 0 + variance = var(x) + mean = Statistics.mean(x) + elseif !isnothing(mean) + n, sse = mapreduce(.+, x) do element + return 1, abs2(element - mean) + end + variance = sse / (n - corrected) + else + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + n = 0 + mean = zero(eltype(x)) + sse = real(zero(mean)) + for element in x + n += 1 + new_mean = mean + (element - mean) / n + sse += realXcY(element - mean, element - new_mean) + mean = new_mean + end + variance = sse / (n - corrected) + end + return n, mean, variance +end + ##### General central moment function _moment2(v::RealArray, m::Real; corrected=false) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 44c7cd9af..ee4c9315d 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -278,9 +278,12 @@ The standard error is then the square root of the above quantities. Sribney, W. 1997. Probability weights, analytic weights, and summary statistics. Stata FAQ. """ -sem(x; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) +function sem(x; mean=nothing) + n, mean, var = n_mean_var(x; mean=mean, corrected=true) + return sqrt(var / n) +end -function sem(x, weights::UnitWeights; mean=nothing) +function sem(x::AbstractVector{<:Number}, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end @@ -288,20 +291,17 @@ function sem(x, weights::UnitWeights; mean=nothing) end # Weighted methods for the above -function sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) +function sem(x::AbstractVector{<:Number}, weights::FrequencyWeights; mean=nothing) return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) end -function sem(x::AbstractArray, weights::ProbabilityWeights; mean=mean(x, weights)) - n = length(x) - if n ≠ length(weights) - throw(DimensionMismatch("array and weights do not have the same length")) - end - +function sem(x::AbstractArray{<:Number}, weights::ProbabilityWeights; mean=nothing) + mean = (isnothing(mean) ? StatsBase.mean(x, weights) : mean) # sum of squared errors = sse - sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w - return (w * (x_i - mean))^2 - end)) + n, sse = reduce(.+, Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w + return !iszero(w), abs2(w * (x_i - mean)) + end) + ) return sqrt(sse * n / (n - 1)) / sum(weights) end diff --git a/test/scalarstats.jl b/test/scalarstats.jl index de7b865e8..7574158ef 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -110,8 +110,14 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:5;]) ≈ 0.707106781186548 @test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 +@test sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5)) ≈ 0.707106781186548 @test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.001 -@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.440215 rtol=.001 +μ = mean(1:5, ProbabilityWeights([1:5;])) +@test sem([1:5;], ProbabilityWeights([1:5;]); mean=μ) ≈ 0.6166 rtol=.001 +@test sem([10; 1:5;], ProbabilityWeights([0; 1:5;]); mean=μ) ≈ 0.6166 rtol=.001 +@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.452452 rtol=.001 +μ = mean([1:100;], ProbabilityWeights([1:100;])) +@test sem([1:100;], ProbabilityWeights([1:100;]); mean=μ) ≈ 2.452452 rtol=.001 @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]) @test sem(Int[]) === NaN @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN From 99d850547d7418a006510a28cde2d488bf644d03 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 19 Jan 2022 13:52:07 -0800 Subject: [PATCH 14/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index ee4c9315d..f6910439c 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -283,7 +283,7 @@ function sem(x; mean=nothing) return sqrt(var / n) end -function sem(x::AbstractVector{<:Number}, weights::UnitWeights; mean=nothing) +function sem(x, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end From 5cd2e4d44ab9e20109e6f863052cc0109b391f9a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 19 Jan 2022 13:52:43 -0800 Subject: [PATCH 15/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index f6910439c..3ebdd614c 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -291,7 +291,7 @@ function sem(x, weights::UnitWeights; mean=nothing) end # Weighted methods for the above -function sem(x::AbstractVector{<:Number}, weights::FrequencyWeights; mean=nothing) +function sem(x::RealArray, weights::FrequencyWeights; mean=nothing) return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) end From 43e354f280b74f3a4e9a4534ad0b211555cd1c40 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 19 Jan 2022 14:03:17 -0800 Subject: [PATCH 16/66] Apply review comments --- src/moments.jl | 2 +- src/scalarstats.jl | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 5e9d4eccd..e9b4ffe70 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -169,7 +169,7 @@ function n_mean_var(x; mean=nothing, corrected=true) n = 0 variance = var(x) mean = Statistics.mean(x) - elseif !isnothing(mean) + elseif mean ≢ nothing n, sse = mapreduce(.+, x) do element return 1, abs2(element - mean) end diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 3ebdd614c..a1869ae4e 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -296,12 +296,13 @@ function sem(x::RealArray, weights::FrequencyWeights; mean=nothing) end function sem(x::AbstractArray{<:Number}, weights::ProbabilityWeights; mean=nothing) - mean = (isnothing(mean) ? StatsBase.mean(x, weights) : mean) + _mean = (mean === nothing ? StatsBase.mean(x, weights) : mean) # sum of squared errors = sse - n, sse = reduce(.+, Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w - return !iszero(w), abs2(w * (x_i - mean)) + sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w + return abs2(w * (x_i - _mean)) end) ) + n = count(!iszero, weights) return sqrt(sse * n / (n - 1)) / sum(weights) end From 44d0b0ec60acdf69ea39efa73373d983fc122830 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 19 Jan 2022 14:04:39 -0800 Subject: [PATCH 17/66] RealArray instead of Array{<:Number} --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index a1869ae4e..bd0dbd627 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -295,7 +295,7 @@ function sem(x::RealArray, weights::FrequencyWeights; mean=nothing) return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) end -function sem(x::AbstractArray{<:Number}, weights::ProbabilityWeights; mean=nothing) +function sem(x::RealArray, weights::ProbabilityWeights; mean=nothing) _mean = (mean === nothing ? StatsBase.mean(x, weights) : mean) # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w From 094c30c2dc275e061d65d6d57ce627c98ce40dc9 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 22 Jan 2022 11:22:43 -0800 Subject: [PATCH 18/66] Improve citation --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index bd0dbd627..f615888d1 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -276,7 +276,7 @@ The standard error is then the square root of the above quantities. # References -Sribney, W. 1997. Probability weights, analytic weights, and summary statistics. Stata FAQ. +Carl-Erik Sarndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling (pp. 51-53). ISBN 9780387975283. """ function sem(x; mean=nothing) n, mean, var = n_mean_var(x; mean=mean, corrected=true) From a3c7e61216b382ad1b6b809387474dc1946f2cb1 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:06:39 -0800 Subject: [PATCH 19/66] Apply code review suggestions --- src/moments.jl | 34 ---------------------------------- src/scalarstats.jl | 42 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index e9b4ffe70..a069d203a 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -158,40 +158,6 @@ function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int; end -""" - n_mean_var(x; mean=nothing, corrected=true) -> (n, mean, var) - -Return the length, mean, and variance of `x`. -""" -function n_mean_var(x; mean=nothing, corrected=true) - if isempty(x) - # Return the NaN of the type that we would get for a nonempty x - n = 0 - variance = var(x) - mean = Statistics.mean(x) - elseif mean ≢ nothing - n, sse = mapreduce(.+, x) do element - return 1, abs2(element - mean) - end - variance = sse / (n - corrected) - else - # Use Welford algorithm as seen in (among other places) - # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - n = 0 - mean = zero(eltype(x)) - sse = real(zero(mean)) - for element in x - n += 1 - new_mean = mean + (element - mean) / n - sse += realXcY(element - mean, element - new_mean) - mean = new_mean - end - variance = sse / (n - corrected) - end - return n, mean, variance -end - - ##### General central moment function _moment2(v::RealArray, m::Real; corrected=false) n = length(v) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index f615888d1..3aa846f42 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -262,7 +262,7 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) """ - sem(x[, weights::AbstractWeights]; [mean,]) + sem(x[, weights::AbstractWeights]; mean=nothing) Return the standard error of the mean for a collection `x`. When using no weights, this is equal to the (sample) standard deviation divided by the sample size. If weights are used, @@ -279,12 +279,44 @@ The standard error is then the square root of the above quantities. Carl-Erik Sarndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling (pp. 51-53). ISBN 9780387975283. """ function sem(x; mean=nothing) - n, mean, var = n_mean_var(x; mean=mean, corrected=true) - return sqrt(var / n) + n = 0 + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + variance = var(x; mean=mean, corrected=corrected) + mean = Statistics.mean(x) + elseif mean ≢ nothing + sse = real(zero(mean)) + @simd for element in x + n += 1 + sse += abs2(element - mean) + end + variance = sse / (n - corrected) + else + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + mean = zero(eltype(x)) + sse = real(mean) + for element in x + n += 1 + new_mean = mean + (element - mean) / n + sse += realXcY(element - mean, element - new_mean) + mean = new_mean + end + variance = sse / (n - corrected) + end + return sqrt(variance / n) end -function sem(x, weights::UnitWeights; mean=nothing) - if length(x) ≠ length(weights) +function sem(x::RealArray; mean=nothing) + var = var(x; mean=mean, corrected=true) + return sqrt(var / length(x)) +end + + + +function sem(x::RealArray, weights::UnitWeights; mean=nothing) + # must be an array, not an iterator, as iterators don't have length defined + if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end return sem(x; mean=mean) From 5d25c7aafe8ac8182816c9a388ccb27f1abff951 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:16:01 -0800 Subject: [PATCH 20/66] whitespace --- src/scalarstats.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 3aa846f42..b2984b85b 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -312,8 +312,6 @@ function sem(x::RealArray; mean=nothing) return sqrt(var / length(x)) end - - function sem(x::RealArray, weights::UnitWeights; mean=nothing) # must be an array, not an iterator, as iterators don't have length defined if length(x) ≠ length(weights) From 59f8b8e0a46dcb442337e91c201bb5093cdde127 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:57:22 -0800 Subject: [PATCH 21/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index b2984b85b..fbc4d424b 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -282,7 +282,7 @@ function sem(x; mean=nothing) n = 0 if isempty(x) # Return the NaN of the type that we would get for a nonempty x - variance = var(x; mean=mean, corrected=corrected) + variance = var(x; mean=mean, corrected=true) mean = Statistics.mean(x) elseif mean ≢ nothing sse = real(zero(mean)) From 73cfb64699a214527b63efc8ea2152227f0fdce2 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:57:29 -0800 Subject: [PATCH 22/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index fbc4d424b..da7f2a954 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -283,7 +283,6 @@ function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x variance = var(x; mean=mean, corrected=true) - mean = Statistics.mean(x) elseif mean ≢ nothing sse = real(zero(mean)) @simd for element in x From af78ffa2addbe9bb8a1b0477d029346783d2e4b1 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:57:35 -0800 Subject: [PATCH 23/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index da7f2a954..5c0320306 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -283,7 +283,7 @@ function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x variance = var(x; mean=mean, corrected=true) - elseif mean ≢ nothing + elseif mean !== nothing sse = real(zero(mean)) @simd for element in x n += 1 From 9264d0a3c196278793f93ad20787a04e3f14be20 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 11:58:02 -0800 Subject: [PATCH 24/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 5c0320306..212eec341 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -285,7 +285,7 @@ function sem(x; mean=nothing) variance = var(x; mean=mean, corrected=true) elseif mean !== nothing sse = real(zero(mean)) - @simd for element in x + for element in x n += 1 sse += abs2(element - mean) end From 50ee853460ae4a7764f7c52f4bf1011cf9198dc7 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 12:00:19 -0800 Subject: [PATCH 25/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 212eec341..392643aae 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -306,10 +306,7 @@ function sem(x; mean=nothing) return sqrt(variance / n) end -function sem(x::RealArray; mean=nothing) - var = var(x; mean=mean, corrected=true) - return sqrt(var / length(x)) -end +sem(x::RealArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) function sem(x::RealArray, weights::UnitWeights; mean=nothing) # must be an array, not an iterator, as iterators don't have length defined From c83beeacc4c9cef918df21ed7d2a0cf6a4ccf807 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 12:00:25 -0800 Subject: [PATCH 26/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 392643aae..201259e86 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -301,7 +301,7 @@ function sem(x; mean=nothing) sse += realXcY(element - mean, element - new_mean) mean = new_mean end - variance = sse / (n - corrected) + variance = sse / (n - 1) end return sqrt(variance / n) end From 4f25268ba460c9cc05927840baaf28a52dcde979 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 15:45:00 -0800 Subject: [PATCH 27/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 201259e86..79dd7f112 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -309,7 +309,6 @@ end sem(x::RealArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) function sem(x::RealArray, weights::UnitWeights; mean=nothing) - # must be an array, not an iterator, as iterators don't have length defined if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end From 9ef970a072e7845ebaf1335a1e0ff17cb81a36e8 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 12:38:16 -0800 Subject: [PATCH 28/66] Apply suggestions --- src/scalarstats.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 79dd7f112..ebc07508d 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -289,11 +289,11 @@ function sem(x; mean=nothing) n += 1 sse += abs2(element - mean) end - variance = sse / (n - corrected) + variance = sse / (n - 1) else # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - mean = zero(eltype(x)) + mean = zero(first(x)) sse = real(mean) for element in x n += 1 @@ -321,7 +321,7 @@ function sem(x::RealArray, weights::FrequencyWeights; mean=nothing) end function sem(x::RealArray, weights::ProbabilityWeights; mean=nothing) - _mean = (mean === nothing ? StatsBase.mean(x, weights) : mean) + _mean = (mean === nothing ? Statistics.mean(x, weights) : mean) # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w return abs2(w * (x_i - _mean)) From 1ea8ef6ee2ffd8dc814214842361329b2f656ddf Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 15:57:12 -0800 Subject: [PATCH 29/66] formatting --- src/moments.jl | 1 + src/scalarstats.jl | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index a069d203a..5b0a66147 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -158,6 +158,7 @@ function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int; end + ##### General central moment function _moment2(v::RealArray, m::Real; corrected=false) n = length(v) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index ebc07508d..0642ae4d5 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -325,8 +325,7 @@ function sem(x::RealArray, weights::ProbabilityWeights; mean=nothing) # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w return abs2(w * (x_i - _mean)) - end) - ) + end)) n = count(!iszero, weights) return sqrt(sse * n / (n - 1)) / sum(weights) end From 3f2361f4aaa04f015c252fa8b4beb76c29e7b033 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 15:58:15 -0800 Subject: [PATCH 30/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 0642ae4d5..83233fbd1 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -316,9 +316,8 @@ function sem(x::RealArray, weights::UnitWeights; mean=nothing) end # Weighted methods for the above -function sem(x::RealArray, weights::FrequencyWeights; mean=nothing) - return sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) -end +sem(x::RealArray, weights::FrequencyWeights; mean=nothing) = + sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) function sem(x::RealArray, weights::ProbabilityWeights; mean=nothing) _mean = (mean === nothing ? Statistics.mean(x, weights) : mean) From b882c495bde0245966ff1e5e3534ef2f6fcbf40f Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 16:00:18 -0800 Subject: [PATCH 31/66] Drop real requirement --- src/scalarstats.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 83233fbd1..3bd218a1b 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -306,9 +306,9 @@ function sem(x; mean=nothing) return sqrt(variance / n) end -sem(x::RealArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) +sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) -function sem(x::RealArray, weights::UnitWeights; mean=nothing) +function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end @@ -316,10 +316,10 @@ function sem(x::RealArray, weights::UnitWeights; mean=nothing) end # Weighted methods for the above -sem(x::RealArray, weights::FrequencyWeights; mean=nothing) = +sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) -function sem(x::RealArray, weights::ProbabilityWeights; mean=nothing) +function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) _mean = (mean === nothing ? Statistics.mean(x, weights) : mean) # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w From 18cf2552f0d19303daa017600a7a006f5e38d0ea Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 16:00:48 -0800 Subject: [PATCH 32/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 3bd218a1b..6dd8c8b7b 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -276,7 +276,8 @@ The standard error is then the square root of the above quantities. # References -Carl-Erik Sarndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling (pp. 51-53). ISBN 9780387975283. +Carl-Erik Särndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling. +New York: Springer. pp. 51-53. """ function sem(x; mean=nothing) n = 0 From 8b556bcd445adcd12a3c95ffbf26e972bbd3b606 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 16:05:40 -0800 Subject: [PATCH 33/66] Test mean keyword with FrequencyWeights --- test/scalarstats.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 7574158ef..76083e622 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -118,7 +118,10 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.452452 rtol=.001 μ = mean([1:100;], ProbabilityWeights([1:100;])) @test sem([1:100;], ProbabilityWeights([1:100;]); mean=μ) ≈ 2.452452 rtol=.001 -@test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]) +x = sort!(vcat([1:i for i in 1:5]...)) +μ = mean(x) +@test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) +@test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) @test sem(Int[]) === NaN @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN @test_throws MethodError sem(Any[]) From dcff0b1a4fb6a4aa0fb7af5c82c09828183bbb0a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 16:06:30 -0800 Subject: [PATCH 34/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 6dd8c8b7b..1bd7ca007 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -262,7 +262,8 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) """ - sem(x[, weights::AbstractWeights]; mean=nothing) + sem(x; mean=nothing) + sem(x::AbstractArray[, weights::AbstractWeights]; mean=nothing) Return the standard error of the mean for a collection `x`. When using no weights, this is equal to the (sample) standard deviation divided by the sample size. If weights are used, From 162631bf75a7a19dd66d3c97446d926cb3924657 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 23 Jan 2022 16:07:41 -0800 Subject: [PATCH 35/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 1bd7ca007..bfe474b79 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -260,7 +260,6 @@ variation(x) = ((m, s) = mean_and_std(x); s/m) realXcY(x::Real, y::Real) = x*y realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) - """ sem(x; mean=nothing) sem(x::AbstractArray[, weights::AbstractWeights]; mean=nothing) From d213c93507f5268ac145b64b7794d11abe195c84 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 11:07:39 -0800 Subject: [PATCH 36/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index bfe474b79..fbe32e303 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -294,7 +294,7 @@ function sem(x; mean=nothing) else # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - mean = zero(first(x)) + mean = zero(first(x))/1 sse = real(mean) for element in x n += 1 From a93a15d44b45a28006d9d5d5dbcd862792516256 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 13:06:03 -0800 Subject: [PATCH 37/66] Use original `sem` implementation --- src/scalarstats.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index fbe32e303..4be7eb4c8 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -292,15 +292,19 @@ function sem(x; mean=nothing) end variance = sse / (n - 1) else + y = iterate(x) # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - mean = zero(first(x))/1 - sse = real(mean) - for element in x + n = 1 + mean = value / 1 + sse = real(zero(mean)) + while y !== nothing + value, state = y n += 1 - new_mean = mean + (element - mean) / n - sse += realXcY(element - mean, element - new_mean) + new_mean = mean + (value - mean) / n + sse += S + realXcY(value - mean, value - new_mean) mean = new_mean + y = iterate(x, state) end variance = sse / (n - 1) end From 74d2c3c59d6a0d43b3f6d07a89fe07c77d6407e9 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 13:07:06 -0800 Subject: [PATCH 38/66] use original SEM implementation --- src/scalarstats.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 4be7eb4c8..f9a527f88 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -292,19 +292,18 @@ function sem(x; mean=nothing) end variance = sse / (n - 1) else - y = iterate(x) + value, state = iterate(x) # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. n = 1 mean = value / 1 sse = real(zero(mean)) while y !== nothing - value, state = y n += 1 new_mean = mean + (value - mean) / n sse += S + realXcY(value - mean, value - new_mean) mean = new_mean - y = iterate(x, state) + value, state = iterate(x, state) end variance = sse / (n - 1) end From adaf472c90191d848f39137e6501a9074a7563ee Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 13:08:41 -0800 Subject: [PATCH 39/66] original sem implementation --- src/scalarstats.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index f9a527f88..d49d5240c 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -292,7 +292,8 @@ function sem(x; mean=nothing) end variance = sse / (n - 1) else - value, state = iterate(x) + y = iterate(x) + value, state = y # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. n = 1 @@ -303,7 +304,8 @@ function sem(x; mean=nothing) new_mean = mean + (value - mean) / n sse += S + realXcY(value - mean, value - new_mean) mean = new_mean - value, state = iterate(x, state) + y = iterate(x, state) + value, state = y end variance = sse / (n - 1) end From acf81ea667a255a6e68755259df86a1b96ad9a29 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 13:09:15 -0800 Subject: [PATCH 40/66] original sem implementation --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index d49d5240c..ab257a80e 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -302,7 +302,7 @@ function sem(x; mean=nothing) while y !== nothing n += 1 new_mean = mean + (value - mean) / n - sse += S + realXcY(value - mean, value - new_mean) + sse += realXcY(value - mean, value - new_mean) mean = new_mean y = iterate(x, state) value, state = y From 303c514f6ce1c841692f204fbcc20ff716a8703c Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 24 Jan 2022 13:14:23 -0800 Subject: [PATCH 41/66] test fix --- test/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 76083e622..99863d4a6 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -118,7 +118,7 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.452452 rtol=.001 μ = mean([1:100;], ProbabilityWeights([1:100;])) @test sem([1:100;], ProbabilityWeights([1:100;]); mean=μ) ≈ 2.452452 rtol=.001 -x = sort!(vcat([1:i for i in 1:5]...)) +x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) @test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) From 5bd4a910a312e62e1266e2d55338ebed1932110e Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 26 Jan 2022 19:36:59 -0800 Subject: [PATCH 42/66] Dispatch on mean --- src/scalarstats.jl | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index ab257a80e..bdcd91490 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -279,42 +279,54 @@ The standard error is then the square root of the above quantities. Carl-Erik Särndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling. New York: Springer. pp. 51-53. """ -function sem(x; mean=nothing) +sem(x, weights::AbstractWeights; mean=nothing) = _sem(x, weights, mean) +sem(x; mean=nothing) = _sem(x, mean) + +function _sem(x, mean) n = 0 if isempty(x) # Return the NaN of the type that we would get for a nonempty x variance = var(x; mean=mean, corrected=true) - elseif mean !== nothing + else sse = real(zero(mean)) for element in x n += 1 sse += abs2(element - mean) end variance = sse / (n - 1) + end + return sqrt(variance / n) +end + +function _sem(x, ::Nothing) + n = 0 + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + variance = var(x; corrected=true) else y = iterate(x) value, state = y # Use Welford algorithm as seen in (among other places) # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - n = 1 mean = value / 1 sse = real(zero(mean)) while y !== nothing + value, state = y + y = iterate(x, state) n += 1 new_mean = mean + (value - mean) / n sse += realXcY(value - mean, value - new_mean) mean = new_mean - y = iterate(x, state) - value, state = y end variance = sse / (n - 1) end return sqrt(variance / n) end -sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) +_sem(x::AbstractArray, mean) = sqrt(var(x; mean=mean, corrected=true) / length(x)) +_sem(x::AbstractArray, ::Nothing) = sqrt(var(x; corrected=true) / length(x)) -function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) +function _sem(x::AbstractArray, weights::UnitWeights, mean) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end @@ -322,14 +334,16 @@ function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) end # Weighted methods for the above -sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = +_sem(x::AbstractArray, weights::FrequencyWeights, mean) = sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) -function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) - _mean = (mean === nothing ? Statistics.mean(x, weights) : mean) +_sem(x::AbstractArray, weights::ProbabilityWeights, ::Nothing) = + _sem(x, weights, mean(x, weights)) + +function _sem(x::AbstractArray, weights::ProbabilityWeights, mean) # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w - return abs2(w * (x_i - _mean)) + return abs2(w * (x_i - mean)) end)) n = count(!iszero, weights) return sqrt(sse * n / (n - 1)) / sum(weights) From 8d904b5470658bcc384f31325f86b25f9c4fa4e9 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Thu, 27 Jan 2022 08:57:41 -0800 Subject: [PATCH 43/66] remove internal method for weights --- src/scalarstats.jl | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index bdcd91490..6876081a8 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -264,9 +264,9 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) sem(x; mean=nothing) sem(x::AbstractArray[, weights::AbstractWeights]; mean=nothing) -Return the standard error of the mean for a collection `x`. When using no weights, this is -equal to the (sample) standard deviation divided by the sample size. If weights are used, -the variance of the sample mean is calculated as follows: +Return the standard error of the mean for a collection `x`. When not using weights, this is +the (sample) standard deviation divided by the sample size. If weights are used, the +variance of the sample mean is calculated as follows: * `AnalyticWeights`: Not implemented. * `FrequencyWeights`: ``\\frac{\\sum_{i=1}^n w_i (x_i - \\bar{x_i})^2}{(\\sum w_i) (\\sum w_i - 1)}`` @@ -279,8 +279,15 @@ The standard error is then the square root of the above quantities. Carl-Erik Särndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling. New York: Springer. pp. 51-53. """ -sem(x, weights::AbstractWeights; mean=nothing) = _sem(x, weights, mean) sem(x; mean=nothing) = _sem(x, mean) +sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) + +function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) + if length(x) ≠ length(weights) + throw(DimensionMismatch("array and weights do not have the same length")) + end + return _sem(x, mean) +end function _sem(x, mean) n = 0 @@ -323,27 +330,16 @@ function _sem(x, ::Nothing) return sqrt(variance / n) end -_sem(x::AbstractArray, mean) = sqrt(var(x; mean=mean, corrected=true) / length(x)) -_sem(x::AbstractArray, ::Nothing) = sqrt(var(x; corrected=true) / length(x)) - -function _sem(x::AbstractArray, weights::UnitWeights, mean) - if length(x) ≠ length(weights) - throw(DimensionMismatch("array and weights do not have the same length")) - end - return sem(x; mean=mean) -end # Weighted methods for the above -_sem(x::AbstractArray, weights::FrequencyWeights, mean) = +sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) -_sem(x::AbstractArray, weights::ProbabilityWeights, ::Nothing) = - _sem(x, weights, mean(x, weights)) - -function _sem(x::AbstractArray, weights::ProbabilityWeights, mean) +function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) + _mean = (mean === nothing) ? mean(x, weights) : mean # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w - return abs2(w * (x_i - mean)) + return abs2(w * (x_i - _mean)) end)) n = count(!iszero, weights) return sqrt(sse * n / (n - 1)) / sum(weights) From f762e37839e9c7f4670ba5bdcd7f50d4624b1f4f Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Thu, 27 Jan 2022 09:01:46 -0800 Subject: [PATCH 44/66] Bug --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 6876081a8..163493667 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -336,7 +336,7 @@ sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) - _mean = (mean === nothing) ? mean(x, weights) : mean + _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w return abs2(w * (x_i - _mean)) From 529cd2104a782371b09176bcd9f234df8475b690 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 29 Jan 2022 10:15:34 -0800 Subject: [PATCH 45/66] Apply code review --- src/scalarstats.jl | 63 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 163493667..63dc8031d 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -279,54 +279,55 @@ The standard error is then the square root of the above quantities. Carl-Erik Särndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling. New York: Springer. pp. 51-53. """ -sem(x; mean=nothing) = _sem(x, mean) +function sem(x; mean=nothing) + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + return var(x; corrected=true) + else + return _sem(x, mean) + end +end sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) throw(DimensionMismatch("array and weights do not have the same length")) end - return _sem(x, mean) + return sem(x; mean=mean) end function _sem(x, mean) - n = 0 - if isempty(x) - # Return the NaN of the type that we would get for a nonempty x - variance = var(x; mean=mean, corrected=true) - else - sse = real(zero(mean)) - for element in x - n += 1 - sse += abs2(element - mean) - end - variance = sse / (n - 1) + n = 1 + y = iterate(x) + value, state = y + sse = abs2(value - mean) + while (y = iterate(x, state)) !== nothing + value, state = y + n += 1 + sse += abs2(value - mean) end + variance = sse / (n - 1) return sqrt(variance / n) end function _sem(x, ::Nothing) n = 0 - if isempty(x) - # Return the NaN of the type that we would get for a nonempty x - variance = var(x; corrected=true) - else - y = iterate(x) + y = iterate(x) + value, state = y + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + mean = value / 1 + sse = real(zero(mean)) + while y !== nothing value, state = y - # Use Welford algorithm as seen in (among other places) - # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - mean = value / 1 - sse = real(zero(mean)) - while y !== nothing - value, state = y - y = iterate(x, state) - n += 1 - new_mean = mean + (value - mean) / n - sse += realXcY(value - mean, value - new_mean) - mean = new_mean - end - variance = sse / (n - 1) + y = iterate(x, state) + n += 1 + new_mean = mean + (value - mean) / n + sse += realXcY(value - mean, value - new_mean) + mean = new_mean end + variance = sse / (n - 1) + return sqrt(variance / n) end From 8afc950fd4f9457cea45a5cc66c6c3f5e284867e Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 29 Jan 2022 11:16:18 -0800 Subject: [PATCH 46/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 63dc8031d..c98deaaec 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -282,7 +282,7 @@ New York: Springer. pp. 51-53. function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x - return var(x; corrected=true) + return sqrt(var(x; mean=mean, corrected=true) / 0) else return _sem(x, mean) end From 989a383794bde8b47e327c6f317a68cdf13f1a8a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 29 Jan 2022 12:22:47 -0800 Subject: [PATCH 47/66] Fix tests --- test/scalarstats.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 99863d4a6..f7c879912 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -122,8 +122,11 @@ x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) @test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) -@test sem(Int[]) === NaN -@test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN +if VersionNumber >= v"1.6" + # in earlier versions, Julia throws an error for var(empty) instead of returning NaN. + @test sem(Int[]) === NaN + @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN +end @test_throws MethodError sem(Any[]) @test_throws MethodError sem(skipmissing([missing])) From b469d6d6ef2788f0042cdfe645eb85a843897604 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 29 Jan 2022 12:46:00 -0800 Subject: [PATCH 48/66] Return NaN for FrequencyWeights --- src/scalarstats.jl | 4 ++++ test/scalarstats.jl | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index c98deaaec..50c71b0a2 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -337,6 +337,10 @@ sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights)) function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + return sqrt(var(x, weights; mean=mean, corrected=true) / 0) + end _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w diff --git a/test/scalarstats.jl b/test/scalarstats.jl index f7c879912..f7d32dcd2 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -122,10 +122,12 @@ x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) @test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) -if VersionNumber >= v"1.6" +if VERSION >= v"1.6" # in earlier versions, Julia throws an error for var(empty) instead of returning NaN. - @test sem(Int[]) === NaN - @test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN + @test isnan(sem(Int[])) + @test isnan(sem(Int[], FrequencyWeights(Int[]))) + @test isnan(sem(Int[], ProbabilityWeights(Int[]))) + @test isnan(sem(skipmissing(Union{Int,Missing}[missing, missing]))) end @test_throws MethodError sem(Any[]) @test_throws MethodError sem(skipmissing([missing])) From e25c4b09424698030c683b966dc551336152d130 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 29 Jan 2022 17:59:50 -0800 Subject: [PATCH 49/66] Type instability fix --- src/scalarstats.jl | 67 ++++++++++++++++++++------------------------- test/scalarstats.jl | 12 ++++---- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 50c71b0a2..14a58929d 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -282,11 +282,39 @@ New York: Springer. pp. 51-53. function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x - return sqrt(var(x; mean=mean, corrected=true) / 0) + T = eltype(x) + return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + elseif mean === nothing + n = 0 + y = iterate(x) + value, state = y + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + _mean = value / 1 + sse = real(zero(_mean)) + while y !== nothing + value, state = y + y = iterate(x, state) + n += 1 + new_mean = _mean + (value - _mean) / n + sse += realXcY(value - _mean, value - new_mean) + _mean = new_mean + end else - return _sem(x, mean) + n = 1 + y = iterate(x) + value, state = y + sse = abs2(value - mean) + while (y = iterate(x, state)) !== nothing + value, state = y + n += 1 + sse += abs2(value - mean) + end end + variance = sse / (n - 1) + return sqrt(variance / n) end + sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) @@ -296,41 +324,6 @@ function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) return sem(x; mean=mean) end -function _sem(x, mean) - n = 1 - y = iterate(x) - value, state = y - sse = abs2(value - mean) - while (y = iterate(x, state)) !== nothing - value, state = y - n += 1 - sse += abs2(value - mean) - end - variance = sse / (n - 1) - return sqrt(variance / n) -end - -function _sem(x, ::Nothing) - n = 0 - y = iterate(x) - value, state = y - # Use Welford algorithm as seen in (among other places) - # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - mean = value / 1 - sse = real(zero(mean)) - while y !== nothing - value, state = y - y = iterate(x, state) - n += 1 - new_mean = mean + (value - mean) / n - sse += realXcY(value - mean, value - new_mean) - mean = new_mean - end - variance = sse / (n - 1) - - return sqrt(variance / n) -end - # Weighted methods for the above sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = diff --git a/test/scalarstats.jl b/test/scalarstats.jl index f7d32dcd2..12bf05dcc 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -122,13 +122,11 @@ x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) @test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) -if VERSION >= v"1.6" - # in earlier versions, Julia throws an error for var(empty) instead of returning NaN. - @test isnan(sem(Int[])) - @test isnan(sem(Int[], FrequencyWeights(Int[]))) - @test isnan(sem(Int[], ProbabilityWeights(Int[]))) - @test isnan(sem(skipmissing(Union{Int,Missing}[missing, missing]))) -end +# in earlier versions, Julia throws an error for var(empty) instead of returning NaN. +@test isnan(sem(Int[])) +@test isnan(sem(Int[], FrequencyWeights(Int[]))) +@test isnan(sem(Int[], ProbabilityWeights(Int[]))) +@test isnan(sem(skipmissing(Union{Int,Missing}[missing, missing]))) @test_throws MethodError sem(Any[]) @test_throws MethodError sem(skipmissing([missing])) From d56a98fc7d84ab4b5e736970af0d79cd8a3394f5 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 30 Jan 2022 09:04:14 -0800 Subject: [PATCH 50/66] Fix array version to return NaN instead of error --- src/scalarstats.jl | 9 ++++++++- test/scalarstats.jl | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 14a58929d..c56431d75 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -315,7 +315,14 @@ function sem(x; mean=nothing) return sqrt(variance / n) end -sem(x::AbstractArray; mean=nothing) = sqrt(var(x; mean=mean, corrected=true) / length(x)) +function sem(x::AbstractArray; mean=nothing) + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + T = eltype(x) + return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + end + return sqrt(var(x; mean=mean, corrected=true) / length(x)) +end function sem(x::AbstractArray, weights::UnitWeights; mean=nothing) if length(x) ≠ length(weights) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 12bf05dcc..769cef34d 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -122,7 +122,6 @@ x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) @test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) -# in earlier versions, Julia throws an error for var(empty) instead of returning NaN. @test isnan(sem(Int[])) @test isnan(sem(Int[], FrequencyWeights(Int[]))) @test isnan(sem(Int[], ProbabilityWeights(Int[]))) From 22c8cc054fe726e65f07bee2b0452e7c799797eb Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sun, 30 Jan 2022 09:07:45 -0800 Subject: [PATCH 51/66] Remove useless tests --- test/scalarstats.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 769cef34d..7479e60ee 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -115,9 +115,6 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] μ = mean(1:5, ProbabilityWeights([1:5;])) @test sem([1:5;], ProbabilityWeights([1:5;]); mean=μ) ≈ 0.6166 rtol=.001 @test sem([10; 1:5;], ProbabilityWeights([0; 1:5;]); mean=μ) ≈ 0.6166 rtol=.001 -@test sem([1:100;], ProbabilityWeights([1:100;])) ≈ 2.452452 rtol=.001 -μ = mean([1:100;], ProbabilityWeights([1:100;])) -@test sem([1:100;], ProbabilityWeights([1:100;]); mean=μ) ≈ 2.452452 rtol=.001 x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) @test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) From 6b90e77999ba25bab94719baf7bd33b7fe2e2d8c Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Mon, 31 Jan 2022 11:15:19 -0800 Subject: [PATCH 52/66] Update src/scalarstats.jl Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index c56431d75..36b6fcaa4 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -316,10 +316,12 @@ function sem(x; mean=nothing) end function sem(x::AbstractArray; mean=nothing) - if isempty(x) - # Return the NaN of the type that we would get for a nonempty x - T = eltype(x) - return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + @static if VERSION < v"1.6" + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + T = eltype(x) + return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + end end return sqrt(var(x; mean=mean, corrected=true) / length(x)) end From f3c9714bb5c6d076276a2e9b2271ec1dadb9ba23 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 1 Feb 2022 09:50:59 +0100 Subject: [PATCH 53/66] Update src/scalarstats.jl --- src/scalarstats.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 36b6fcaa4..f3a8bf3ad 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -264,8 +264,11 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) sem(x; mean=nothing) sem(x::AbstractArray[, weights::AbstractWeights]; mean=nothing) -Return the standard error of the mean for a collection `x`. When not using weights, this is -the (sample) standard deviation divided by the sample size. If weights are used, the +Return the standard error of the mean for a collection `x`. +A pre-computed `mean` may be provided. + +When not using weights, this is the (sample) standard deviation +divided by the sample size. If weights are used, the variance of the sample mean is calculated as follows: * `AnalyticWeights`: Not implemented. From cfba9a1714155ac5a76098ade9759cdc38c9c5ee Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 1 Feb 2022 10:05:16 +0100 Subject: [PATCH 54/66] Update test/scalarstats.jl --- test/scalarstats.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 7479e60ee..67fb84020 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -109,6 +109,7 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem([1:5;]) ≈ 0.707106781186548 @test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 +@test sem(skipmissing([missing; 1:5; missing]), mean=3.0) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5)) ≈ 0.707106781186548 @test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.001 From d024475bd280dac65b5e19d0d4624809e121f052 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 11:52:24 -0800 Subject: [PATCH 55/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index f3a8bf3ad..c47746e3f 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -323,7 +323,9 @@ function sem(x::AbstractArray; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + _mean = mean === nothing ? zero(T) / 1 : mean + z = abs2(zero(T) - _mean) + return oftype((z + z) / 2, NaN) end end return sqrt(var(x; mean=mean, corrected=true) / length(x)) From b64d44d6e18cc1873425aaf2e680066be8c931ae Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 11:54:35 -0800 Subject: [PATCH 56/66] Add test --- test/scalarstats.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 67fb84020..052717134 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -112,6 +112,7 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test sem(skipmissing([missing; 1:5; missing]), mean=3.0) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 @test sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5)) ≈ 0.707106781186548 +@test_throws DimensionMismatch sem(1:5, UnitWeights{Int}(4)) @test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.001 μ = mean(1:5, ProbabilityWeights([1:5;])) @test sem([1:5;], ProbabilityWeights([1:5;]); mean=μ) ≈ 0.6166 rtol=.001 From e31748abc13c6bc1219bfa88f388c849837ff552 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 11:55:59 -0800 Subject: [PATCH 57/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index c47746e3f..2a61de8e0 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -286,7 +286,9 @@ function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) + _mean = mean === nothing ? zero(T) / 1 : mean + z = abs2(zero(T) - _mean) + return oftype((z + z) / 2, NaN) elseif mean === nothing n = 0 y = iterate(x) From 72d1e482eb65a93edc1e587522615169a55b88b1 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 11:57:00 -0800 Subject: [PATCH 58/66] Fix type instability --- src/scalarstats.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 2a61de8e0..bbfddccb1 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -348,7 +348,10 @@ sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x - return sqrt(var(x, weights; mean=mean, corrected=true) / 0) + T = eltype(x) + _mean = mean === nothing ? zero(T) / 1 : mean + z = abs2(zero(T) - _mean) + return oftype((z + z) / 2, NaN) end _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean # sum of squared errors = sse From 6ceeac4678973d99aaab620110403663fde2b328 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 11:57:50 -0800 Subject: [PATCH 59/66] Bug fix --- src/scalarstats.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index bbfddccb1..a41d3a653 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -321,14 +321,12 @@ function sem(x; mean=nothing) end function sem(x::AbstractArray; mean=nothing) - @static if VERSION < v"1.6" - if isempty(x) - # Return the NaN of the type that we would get for a nonempty x - T = eltype(x) + if isempty(x) + # Return the NaN of the type that we would get for a nonempty x + T = eltype(x) _mean = mean === nothing ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) - end end return sqrt(var(x; mean=mean, corrected=true) / length(x)) end From 577914444b120ce9c99b72050c7717aab80f900a Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 12:25:38 -0800 Subject: [PATCH 60/66] Test for type stability --- src/scalarstats.jl | 15 ++++++++------- test/scalarstats.jl | 27 +++++++++++++++++---------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index a41d3a653..2493e77c1 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -350,14 +350,15 @@ function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) _mean = mean === nothing ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) + else + _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean + # sum of squared errors = sse + sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w + return abs2(w * (x_i - _mean)) + end)) + n = count(!iszero, weights) + return sqrt(sse * n / (n - 1)) / sum(weights) end - _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean - # sum of squared errors = sse - sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w - return abs2(w * (x_i - _mean)) - end)) - n = count(!iszero, weights) - return sqrt(sse * n / (n - 1)) / sum(weights) end # Median absolute deviation diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 052717134..e5c3b0b66 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -107,20 +107,27 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test variation([1:5;]) ≈ 0.527046276694730 @test variation(skipmissing([missing; 1:5; missing])) ≈ 0.527046276694730 -@test sem([1:5;]) ≈ 0.707106781186548 -@test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 -@test sem(skipmissing([missing; 1:5; missing]), mean=3.0) ≈ 0.707106781186548 -@test sem([1:5;], UnitWeights{Int}(5)) ≈ 0.707106781186548 -@test sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5)) ≈ 0.707106781186548 +@test @inferred(sem([1:5;])) ≈ 0.707106781186548 +@test @inferred(sem(skipmissing([missing; 1:5; missing]))) ≈ 0.707106781186548 +@test @inferred(sem(skipmissing([missing; 1:5; missing]), mean=3.0)) ≈ 0.707106781186548 +@test @inferred(sem([1:5;], UnitWeights{Int}(5))) ≈ 0.707106781186548 +@test @inferred(sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5))) ≈ 0.707106781186548 @test_throws DimensionMismatch sem(1:5, UnitWeights{Int}(4)) -@test sem([1:5;], ProbabilityWeights([1:5;])) ≈ 0.6166 rtol=.001 +@test @inferred(sem([1:5;], ProbabilityWeights([1:5;]))) ≈ 0.6166 rtol=.001 μ = mean(1:5, ProbabilityWeights([1:5;])) -@test sem([1:5;], ProbabilityWeights([1:5;]); mean=μ) ≈ 0.6166 rtol=.001 -@test sem([10; 1:5;], ProbabilityWeights([0; 1:5;]); mean=μ) ≈ 0.6166 rtol=.001 +@test @inferred(sem([1:5;], ProbabilityWeights([1:5;]); mean=μ)) ≈ 0.6166 rtol=.001 +@test @inferred(sem([10; 1:5;], ProbabilityWeights([0; 1:5;]); mean=μ)) ≈ 0.6166 rtol=.001 x = sort!(vcat([5:-1:i for i in 1:5]...)) μ = mean(x) -@test sem([1:5;], FrequencyWeights([1:5;])) ≈ sem(x) -@test sem([1:5;], FrequencyWeights([1:5;]); mean=μ) ≈ sem(x) +@test @inferred(sem([1:5;], FrequencyWeights([1:5;]))) ≈ sem(x) +@test @inferred(sem([1:5;], FrequencyWeights([1:5;]); mean=μ)) ≈ sem(x) + +@inferred sem([1:5f0;]; mean=μ) ≈ sem(x) +@inferred sem([1:5f0;], ProbabilityWeights([1:5;]); mean=μ) +@inferred sem([1:5f0;], FrequencyWeights([1:5;]); mean=μ) +# Broken: Bug to do with implementation of +# @inferred sem([1:5f0;], UnitWeights{Int}(5); mean=μ) + @test isnan(sem(Int[])) @test isnan(sem(Int[], FrequencyWeights(Int[]))) @test isnan(sem(Int[], ProbabilityWeights(Int[]))) From a520d2049f1d257a3302dec78fa2f3a16ffea39e Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Wed, 2 Feb 2022 19:14:37 -0800 Subject: [PATCH 61/66] Fix type instability --- src/scalarstats.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 2493e77c1..164c6221e 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -286,7 +286,7 @@ function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - _mean = mean === nothing ? zero(T) / 1 : mean + _mean = (mean === nothing) ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) elseif mean === nothing @@ -324,7 +324,7 @@ function sem(x::AbstractArray; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - _mean = mean === nothing ? zero(T) / 1 : mean + _mean = (mean === nothing) ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) end @@ -346,10 +346,7 @@ sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x - T = eltype(x) - _mean = mean === nothing ? zero(T) / 1 : mean - z = abs2(zero(T) - _mean) - return oftype((z + z) / 2, NaN) + return var(x, weights; mean=nothing, corrected=true) / 0 else _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean # sum of squared errors = sse From 7b40f1c9a20e1ee66567035e685158e0555dc9f1 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Thu, 3 Feb 2022 08:38:27 -0800 Subject: [PATCH 62/66] mean type fix --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 164c6221e..71ba8a54a 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -346,7 +346,7 @@ sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) = function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x - return var(x, weights; mean=nothing, corrected=true) / 0 + return var(x, weights; mean=mean, corrected=true) / 0 else _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean # sum of squared errors = sse From 4683ec4fa1aa06da0ef3f1ae30191b36b2eb2115 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Thu, 3 Feb 2022 09:33:03 -0800 Subject: [PATCH 63/66] test for type inference --- test/scalarstats.jl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index e5c3b0b66..4d5054918 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -125,13 +125,18 @@ x = sort!(vcat([5:-1:i for i in 1:5]...)) @inferred sem([1:5f0;]; mean=μ) ≈ sem(x) @inferred sem([1:5f0;], ProbabilityWeights([1:5;]); mean=μ) @inferred sem([1:5f0;], FrequencyWeights([1:5;]); mean=μ) -# Broken: Bug to do with implementation of +# Broken: Bug to do with Statistics.jl's implementation of `var` # @inferred sem([1:5f0;], UnitWeights{Int}(5); mean=μ) -@test isnan(sem(Int[])) -@test isnan(sem(Int[], FrequencyWeights(Int[]))) -@test isnan(sem(Int[], ProbabilityWeights(Int[]))) -@test isnan(sem(skipmissing(Union{Int,Missing}[missing, missing]))) +@test @inferred(isnan(sem(Int[]))) +@test @inferred(isnan(sem(Int[], FrequencyWeights(Int[])))) +@test @inferred(isnan(sem(Int[], ProbabilityWeights(Int[])))) + +@test @inferred(isnan(sem(Int[]; mean=0f0))) +@test @inferred(isnan(sem(Int[], FrequencyWeights(Int[]); mean=0f0))) +@test @inferred(isnan(sem(Int[], ProbabilityWeights(Int[]); mean=0f0))) + +@test @inferred(isnan(sem(skipmissing(Union{Int,Missing}[missing, missing])))) @test_throws MethodError sem(Any[]) @test_throws MethodError sem(skipmissing([missing])) From ab29aa706e39407ceca7bbe8ac95be71d6ea9de7 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 5 Feb 2022 07:47:15 -0800 Subject: [PATCH 64/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 71ba8a54a..53f69e878 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -286,7 +286,7 @@ function sem(x; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - _mean = (mean === nothing) ? zero(T) / 1 : mean + _mean = mean === nothing ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) elseif mean === nothing From 1f32003f659dff59b612e08814df79c46f8d4d61 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 5 Feb 2022 07:47:25 -0800 Subject: [PATCH 65/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 53f69e878..0fe116190 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -324,7 +324,7 @@ function sem(x::AbstractArray; mean=nothing) if isempty(x) # Return the NaN of the type that we would get for a nonempty x T = eltype(x) - _mean = (mean === nothing) ? zero(T) / 1 : mean + _mean = mean === nothing ? zero(T) / 1 : mean z = abs2(zero(T) - _mean) return oftype((z + z) / 2, NaN) end From 632e2121dd3cb2c8003f45e5527c9c0efae57f35 Mon Sep 17 00:00:00 2001 From: Carlos Parada Date: Sat, 5 Feb 2022 07:47:31 -0800 Subject: [PATCH 66/66] Update src/scalarstats.jl Co-authored-by: David Widmann --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 0fe116190..1e4406c8a 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -348,7 +348,7 @@ function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing) # Return the NaN of the type that we would get for a nonempty x return var(x, weights; mean=mean, corrected=true) / 0 else - _mean = (mean === nothing) ? Statistics.mean(x, weights) : mean + _mean = mean === nothing ? Statistics.mean(x, weights) : mean # sum of squared errors = sse sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w return abs2(w * (x_i - _mean))