From 574827c3ece6e92813ff4fa99138c7bfdc532c48 Mon Sep 17 00:00:00 2001 From: Praneya Kumar Date: Sat, 5 Apr 2025 16:25:38 +0530 Subject: [PATCH 1/3] Add Bartlett's and Levene's tests for homogeneity of variances --- Statistics/Test/Bartlett.hs | 92 +++++++++++++++++++++++ Statistics/Test/Levene.hs | 144 ++++++++++++++++++++++++++++++++++++ changelog.md | 10 +++ statistics.cabal | 2 + 4 files changed, 248 insertions(+) create mode 100644 Statistics/Test/Bartlett.hs create mode 100644 Statistics/Test/Levene.hs diff --git a/Statistics/Test/Bartlett.hs b/Statistics/Test/Bartlett.hs new file mode 100644 index 0000000..1443bb7 --- /dev/null +++ b/Statistics/Test/Bartlett.hs @@ -0,0 +1,92 @@ +{-# LANGUAGE FlexibleContexts #-} +{-| +Module : Statistics.Test.Bartlett +Description : Bartlett's test for homogeneity of variances. +Copyright : (c) Praneya Kumar, 2025 +License : BSD-3-Clause + +Implements Bartlett's test to check if multiple groups have equal variances. +Assesses equality of variances assuming normal distribution, sensitive to non-normality. +-} +module Statistics.Test.Bartlett ( + bartlettTest, + module Statistics.Distribution.ChiSquared +) where + +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Unboxed as U +import Statistics.Distribution (cumulative) +import Statistics.Distribution.ChiSquared (chiSquared, ChiSquared(..)) +import Statistics.Sample (varianceUnbiased) +import Statistics.Types (mkPValue) +import Statistics.Test.Types (Test(..)) + +-- | Perform Bartlett's test for equal variances. +-- The input is a list of vectors, where each vector represents a group of observations. +-- Returns Either an error message or a Test ChiSquared containing the test statistic and p-value. +bartlettTest :: [U.Vector Double] -> Either String (Test ChiSquared) +bartlettTest groups + | length groups < 2 = Left "At least two groups are required for Bartlett's test." + | any ((< 2) . G.length) groups = Left "Each group must have at least two observations." + | any (<= 0) groupVariances = Left "All groups must have positive variance." + | otherwise = Right $ Test + { testSignificance = pValue + , testStatistics = tStatistic + , testDistribution = chiDist + } + where + -- Number of groups + k = length groups + + -- Sample sizes for each group + ni = map G.length groups + ni' = map fromIntegral ni + + -- Total number of observations across all groups + nTotal = sum ni + + -- Variance for each group (unbiased estimate) + groupVariances = map varianceUnbiased groups + + -- Pooled variance calculation + sumWeightedVars = sum [ (n - 1) * v | (n, v) <- zip ni' groupVariances ] + pooledVariance = sumWeightedVars / fromIntegral (nTotal - k) + + -- Numerator of Bartlett's statistic + numerator = + fromIntegral (nTotal - k) * log pooledVariance - + sum [ (n - 1) * log v | (n, v) <- zip ni' groupVariances ] + + -- Denominator correction term + sumReciprocals = sum [1 / (n - 1) | n <- ni'] + denomCorrection = + 1 + (sumReciprocals - 1 / fromIntegral (nTotal - k)) / (3 * (fromIntegral k - 1)) + + -- Test statistic T + tStatistic = max 0 $ numerator / denomCorrection + + -- Degrees of freedom and chi-squared distribution + df = k - 1 + chiDist = chiSquared df + pValue = mkPValue $ 1 - cumulative chiDist tStatistic + + +-- Example usage: +-- import qualified Data.Vector.Unboxed as U +-- import Statistics.Test.Bartlett + +-- main :: IO () +-- main = do +-- let a = U.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] +-- b = U.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] +-- c = U.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + +-- case bartlettTest [a,b,c] of +-- Left err -> putStrLn $ "Error: " ++ err +-- Right test -> do +-- putStrLn $ "Bartlett's Test Statistic: " ++ show (testStatistics test) +-- putStrLn $ "P-Value: " ++ show (testSignificance test) + +-- Sample Output +-- Bartlett's Test Statistic: ~32 +-- P-Value: ~1e-5 diff --git a/Statistics/Test/Levene.hs b/Statistics/Test/Levene.hs new file mode 100644 index 0000000..022f219 --- /dev/null +++ b/Statistics/Test/Levene.hs @@ -0,0 +1,144 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE BangPatterns #-} +{-| +Module : Statistics.Test.Levene +Description : Levene's test for homogeneity of variances. +Copyright : (c) Praneya Kumar, 2025 +License : BSD-3-Clause + +Implements Levene's test to check if multiple groups have equal variances. +Assesses equality of variances, robust to non-normality, and versatile with mean or median centering. +-} +module Statistics.Test.Levene ( + Center(..), + TestResult(..), + levenesTest +) where + +import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as U +import qualified Data.Vector.Algorithms.Merge as VA +import Statistics.Distribution +import Statistics.Distribution.FDistribution (fDistribution) +import qualified Statistics.Sample as Sample +import Control.Exception (assert) + +-- | Center calculation method with proper documentation +data Center = + Mean -- ^ Use arithmetic mean + | Median -- ^ Use median + | Trimmed Double -- ^ Trimmed mean with given proportion to cut from each end + deriving (Eq, Show) + +-- | Result type with comprehensive information +data TestResult = TestResult { + wStatistic :: Double -- ^ Levene's W statistic + , pValue :: Double -- ^ Associated p-value + , rejectNull :: Bool -- ^ Whether to reject null hypothesis at given alpha + } deriving (Eq, Show) + +-- | Trim data from both ends with error handling and performance optimization +trimboth :: (Ord a, Fractional a) + => V.Vector a + -> Double + -> Either String (V.Vector a) +trimboth vec prop + | prop < 0 || prop > 1 = Left "Proportion must be between 0 and 1" + | V.null vec = Right vec + | otherwise = do + let sorted = V.modify VA.sort vec + n = V.length sorted + lowerCut = floor (prop * fromIntegral n) + upperCut = n - lowerCut + assert (upperCut >= lowerCut) $ + Right $ V.slice lowerCut (upperCut - lowerCut) sorted + +-- | Calculate median using pre-sorted vector +vectorMedian :: (Fractional a, Ord a) + => V.Vector a + -> Either String a +vectorMedian vec + | V.null vec = Left "Empty vector in median calculation" + | otherwise = Right $ + if odd len + then sorted V.! mid + else (sorted V.! (mid - 1) + sorted V.! mid) / 2 + where + sorted = V.modify VA.sort vec + len = V.length sorted + mid = len `div` 2 + +-- | Main Levene's test function with full error handling +levenesTest :: Double -- ^ Significance level (alpha) + -> Center -- ^ Center calculation method + -> [V.Vector Double] -- ^ Input samples + -> Either String TestResult +levenesTest alpha center samples + | alpha < 0 || alpha > 1 = Left "Significance level must be between 0 and 1" + | length samples < 2 = Left "At least two samples required" + | otherwise = do + processed <- mapM processSample samples + let (deviations, ni) = unzip processed + k = length samples + n = sum ni + zbari = map Sample.mean deviations + zbar = sum (zipWith (*) zbari (map fromIntegral ni)) / fromIntegral n + + numerator = sum [fromIntegral (ni !! i) * (zbari !! i - zbar) ** 2 | i <- [0..k-1]] + denominator = sum [U.sum (U.map (\x -> (x - zbari !! i) ** 2) (deviations !! i)) | i <- [0..k-1]] + + -- Handle division by zero and invalid values + if denominator <= 0 || isNaN denominator || isInfinite denominator + then Left "Invalid denominator in W-statistic calculation" + else do + let wStat = (fromIntegral (n - k) / fromIntegral (k - 1)) * (numerator / denominator) + df1 = k - 1 + df2 = n - k + pVal = 1 - cumulative (fDistribution df1 df2) wStat + reject = pVal < alpha + + -- Validate distribution parameters + if df1 < 1 || df2 < 1 + then Left "Invalid degrees of freedom" + else Right $ TestResult wStat pVal reject + where + -- Process samples with error handling and optimized sorting + processSample vec = case center of + Mean -> do + let dev = V.map (abs . subtract (Sample.mean vec)) vec + return (U.convert dev, V.length vec) + + Median -> do + sortedVec <- Right $ V.modify VA.sort vec + m <- vectorMedian sortedVec + let dev = V.map (abs . subtract m) sortedVec + return (U.convert dev, V.length vec) + + Trimmed p -> do + trimmed <- trimboth vec p + let c = Sample.mean trimmed + dev = V.map (abs . subtract c) trimmed + return (U.convert dev, V.length trimmed) + + +-- Example usage: +-- import qualified Data.Vector as V +-- import LevenesTest (Center(..), levenesTest, TestResult(..)) + +-- main :: IO () +-- main = do +-- let a = V.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] +-- b = V.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] +-- c = V.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + +-- case levenesTest 0.05 (Trimmed 0.1) [a, b, c] of +-- Left err -> putStrLn $ "Error: " ++ err +-- Right test -> do +-- putStrLn $ "Levene's W Statistic: " ++ show (wStatistic test) +-- putStrLn $ "P-Value: " ++ show (pValue test) +-- putStrLn $ "Reject null hypothesis at α=0.05: " ++ show (rejectNull test) + +-- Sample Output +-- Levene's W Statistic: 0.852 +-- P-Value: 0.432 +-- Reject null hypothesis at α=0.05: False diff --git a/changelog.md b/changelog.md index 35a9bb6..4ca84ff 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,13 @@ +## Unreleased + +- **New Features**: + - Implemented Bartlett's test (`Statistics.Test.Bartlett`) for homogeneity of variances. + - Implemented Levene's test (`Statistics.Test.Levene`) for homogeneity of variances. + - Resolves [#137](https://github.com/haskell/statistics/issues/137). + +- **Documentation**: + - Added usage examples and Haddock comments for both tests. + ## Changes in 0.16.3.0 * `S.Sample.correlation`, `S.Sample.covariance`, diff --git a/statistics.cabal b/statistics.cabal index 1d933ec..8f5620e 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -105,6 +105,8 @@ library Statistics.Sample.KernelDensity.Simple Statistics.Sample.Normalize Statistics.Sample.Powers + Statistics.Test.Bartlett + Statistics.Test.Levene Statistics.Test.ChiSquared Statistics.Test.KolmogorovSmirnov Statistics.Test.KruskalWallis From 92565bc5fb3ed69a1117bea233f00aae8a4f6254 Mon Sep 17 00:00:00 2001 From: Praneya Kumar Date: Sun, 13 Apr 2025 15:51:56 +0530 Subject: [PATCH 2/3] Using vector instead of lists --- Statistics/Test/Levene.hs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Statistics/Test/Levene.hs b/Statistics/Test/Levene.hs index 022f219..2700603 100644 --- a/Statistics/Test/Levene.hs +++ b/Statistics/Test/Levene.hs @@ -78,14 +78,19 @@ levenesTest alpha center samples | length samples < 2 = Left "At least two samples required" | otherwise = do processed <- mapM processSample samples - let (deviations, ni) = unzip processed - k = length samples - n = sum ni - zbari = map Sample.mean deviations - zbar = sum (zipWith (*) zbari (map fromIntegral ni)) / fromIntegral n + let (deviationsList, niList) = unzip processed + deviations = V.fromList deviationsList -- V.Vector (U.Vector Double) + ni = V.fromList niList -- V.Vector Int + zbari = V.map Sample.mean deviations -- V.Vector Double + k = V.length deviations + n = V.sum ni + zbar = V.sum (V.zipWith (\z n' -> z * fromIntegral n') zbari ni) / fromIntegral n - numerator = sum [fromIntegral (ni !! i) * (zbari !! i - zbar) ** 2 | i <- [0..k-1]] - denominator = sum [U.sum (U.map (\x -> (x - zbari !! i) ** 2) (deviations !! i)) | i <- [0..k-1]] + -- Numerator: Sum over (ni * (zbari - zbar)^2) + numerator = V.sum $ V.zipWith (\n z -> fromIntegral n * (z - zbar) ** 2) ni zbari + + -- Denominator: Sum over sum((dev_ij - zbari)^2) + denominator = V.sum $ V.zipWith (\dev z -> U.sum (U.map (\x -> (x - z) ** 2) dev)) deviations zbari -- Handle division by zero and invalid values if denominator <= 0 || isNaN denominator || isInfinite denominator From fc1b002b5a3934713676186f1fd92ae1d50e7818 Mon Sep 17 00:00:00 2001 From: Praneya Kumar Date: Wed, 21 May 2025 13:43:49 +0530 Subject: [PATCH 3/3] Changed return type for Levene Test and added simple unit tests for Bartlett and Levene Tests --- Statistics/Test/Levene.hs | 71 ++++++++++++++++++++------------------- tests/Tests/Parametric.hs | 62 +++++++++++++++++++++++++++++++--- 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/Statistics/Test/Levene.hs b/Statistics/Test/Levene.hs index 2700603..ffa545c 100644 --- a/Statistics/Test/Levene.hs +++ b/Statistics/Test/Levene.hs @@ -1,5 +1,5 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE BangPatterns #-} + {-| Module : Statistics.Test.Levene Description : Levene's test for homogeneity of variances. @@ -11,32 +11,26 @@ Assesses equality of variances, robust to non-normality, and versatile with mean -} module Statistics.Test.Levene ( Center(..), - TestResult(..), levenesTest ) where import qualified Data.Vector as V import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Algorithms.Merge as VA -import Statistics.Distribution -import Statistics.Distribution.FDistribution (fDistribution) +import Statistics.Distribution (cumulative) +import Statistics.Distribution.FDistribution (fDistribution, FDistribution) +import Statistics.Types (mkPValue) +import Statistics.Test.Types (Test(..)) import qualified Statistics.Sample as Sample import Control.Exception (assert) --- | Center calculation method with proper documentation +-- | Center calculation method data Center = - Mean -- ^ Use arithmetic mean - | Median -- ^ Use median - | Trimmed Double -- ^ Trimmed mean with given proportion to cut from each end + Mean -- ^ Use arithmetic mean + | Median -- ^ Use median + | Trimmed Double -- ^ Trimmed mean with given proportion to cut from each end deriving (Eq, Show) --- | Result type with comprehensive information -data TestResult = TestResult { - wStatistic :: Double -- ^ Levene's W statistic - , pValue :: Double -- ^ Associated p-value - , rejectNull :: Bool -- ^ Whether to reject null hypothesis at given alpha - } deriving (Eq, Show) - -- | Trim data from both ends with error handling and performance optimization trimboth :: (Ord a, Fractional a) => V.Vector a @@ -69,10 +63,10 @@ vectorMedian vec mid = len `div` 2 -- | Main Levene's test function with full error handling -levenesTest :: Double -- ^ Significance level (alpha) - -> Center -- ^ Center calculation method - -> [V.Vector Double] -- ^ Input samples - -> Either String TestResult +levenesTest :: Double -- ^ Significance level (alpha) + -> Center -- ^ Centering method + -> [V.Vector Double] -- ^ Input samples + -> Either String (Test FDistribution) levenesTest alpha center samples | alpha < 0 || alpha > 1 = Left "Significance level must be between 0 and 1" | length samples < 2 = Left "At least two samples required" @@ -99,13 +93,17 @@ levenesTest alpha center samples let wStat = (fromIntegral (n - k) / fromIntegral (k - 1)) * (numerator / denominator) df1 = k - 1 df2 = n - k - pVal = 1 - cumulative (fDistribution df1 df2) wStat - reject = pVal < alpha + fDist = fDistribution df1 df2 + pVal = mkPValue $ 1 - cumulative fDist wStat -- Validate distribution parameters if df1 < 1 || df2 < 1 then Left "Invalid degrees of freedom" - else Right $ TestResult wStat pVal reject + else Right $ Test + { testStatistics = wStat + , testSignificance = pVal + , testDistribution = fDist + } where -- Process samples with error handling and optimized sorting processSample vec = case center of @@ -120,15 +118,19 @@ levenesTest alpha center samples return (U.convert dev, V.length vec) Trimmed p -> do - trimmed <- trimboth vec p - let c = Sample.mean trimmed - dev = V.map (abs . subtract c) trimmed - return (U.convert dev, V.length trimmed) + trimmed_for_center_calculation <- trimboth vec p + let robust_center = Sample.mean trimmed_for_center_calculation + -- Calculate deviations for ALL ORIGINAL points from the robust_center + deviations_from_robust_center = V.map (abs . subtract robust_center) vec -- Use 'vec' (original data) + -- Return deviations and the ORIGINAL sample size + return (U.convert deviations_from_robust_center, V.length vec) -- Use 'V.length vec' -- Example usage: -- import qualified Data.Vector as V --- import LevenesTest (Center(..), levenesTest, TestResult(..)) +-- import LevenesTest (Center(..), levenesTest) +-- import Statistics.Test.Types (testStatistics, testSignificance) +-- import Statistics.Types (pValue) -- main :: IO () -- main = do @@ -136,14 +138,15 @@ levenesTest alpha center samples -- b = V.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] -- c = V.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] --- case levenesTest 0.05 (Trimmed 0.1) [a, b, c] of +-- case levenesTest (Trimmed 0.05) [a, b, c] of -- Left err -> putStrLn $ "Error: " ++ err -- Right test -> do --- putStrLn $ "Levene's W Statistic: " ++ show (wStatistic test) --- putStrLn $ "P-Value: " ++ show (pValue test) --- putStrLn $ "Reject null hypothesis at α=0.05: " ++ show (rejectNull test) +-- putStrLn $ "Levene's W Statistic: " ++ show (testStatistics test) +-- putStrLn $ "P-Value: " ++ show (pValue (testSignificance test)) +-- putStrLn $ "Reject null hypothesis at α=0.05: " ++ show (testSignificance test < 0.05) + -- Sample Output --- Levene's W Statistic: 0.852 --- P-Value: 0.432 --- Reject null hypothesis at α=0.05: False +-- Levene's W Statistic: 7.905 +-- P-Value: 0.002 +-- Reject null hypothesis at α=0.05: True diff --git a/tests/Tests/Parametric.hs b/tests/Tests/Parametric.hs index e2ffd9b..05f04ab 100644 --- a/tests/Tests/Parametric.hs +++ b/tests/Tests/Parametric.hs @@ -4,12 +4,18 @@ import Data.Maybe (fromJust) import Statistics.Test.StudentT import Statistics.Types import qualified Data.Vector.Unboxed as U -import Test.Tasty (testGroup) -import Tests.Helpers (testEquality) +import qualified Data.Vector as V +import Test.Tasty (testGroup, TestTree) +import Test.Tasty.HUnit (testCase, assertBool) +import Tests.Helpers (testEquality) import qualified Test.Tasty as Tst +import Statistics.Test.Levene +import Statistics.Test.Bartlett + + tests :: Tst.TestTree -tests = testGroup "Parametric tests" studentTTests +tests = testGroup "Parametric tests" (studentTTests ++ bartlettTests ++ leveneTests) -- 2 samples x 20 obs data -- @@ -77,7 +83,7 @@ testTTest name pVal test = , testEquality name (isSignificant (mkPValue $ pValue pVal + 1e-5) test) Significant ] - + studentTTests :: [Tst.TestTree] studentTTests = concat [ -- R: t.test(sample1, sample2, alt="two.sided", var.equal=T) @@ -100,3 +106,51 @@ studentTTests = concat (mkPValue 0.01705) (fromJust $ pairedTTest BGreater sample12) ] where sample12 = U.zip sample1 sample2 + + +------------------------------------------------------------ +-- Bartlett's Test +------------------------------------------------------------ + +bartlettTests :: [TestTree] +bartlettTests = + let + a = U.fromList [9.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] + b = U.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 9.36, 9.18, 8.67, 9.05] + c = U.fromList [8.95, 8.12, 8.95, 8.85, 8.03, 8.84, 8.07, 8.98, 8.86, 8.98] + result = bartlettTest [a, b, c] + in case result of + Left err -> [testCase "Bartlett test - failed" (assertBool err False)] + Right test -> + [ testApproxEqual "Bartlett's Chi-Square Statistic" 1e-3 (testStatistics test) 1.802 + , testApproxEqual "Bartlett's P-Value" 1e-3 (pValue $ testSignificance test) 0.406 + , testEquality "Reject null hypothesis at alpha = 0.05" + (isSignificant (mkPValue 0.05) test) NotSignificant + ] + +------------------------------------------------------------ +-- Levene's Test (Trimmed Mean) +------------------------------------------------------------ + +-- Local helper for approximate equality +testApproxEqual :: String -> Double -> Double -> Double -> TestTree +testApproxEqual name epsilon actual expected = + testCase name $ + let diff = abs (actual - expected) + in assertBool (name ++ ": expected ≈ " ++ show expected ++ ", got " ++ show actual) (diff < epsilon) + +leveneTests :: [TestTree] +leveneTests = + let + a = V.fromList [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] + b = V.fromList [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] + c = V.fromList [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] + result = levenesTest 0.05 (Trimmed 0.05) [a, b, c] + in case result of + Left err -> [testCase "Levene trimmed mean test - failed" (assertBool err False)] + Right test -> + [ testApproxEqual "Levene's W Statistic (Trimmed 0.05)" 1e-3 (testStatistics test) 7.905 + , testApproxEqual "Levene's P-Value (Trimmed 0.05)" 1e-3 (pValue $ testSignificance test) 0.002 + , testEquality "Reject null hypothesis at alpha = 0.05" + (isSignificant (mkPValue 0.05) test) Significant + ]