diff --git a/docs/reference/databases.md b/docs/reference/databases.md
index 7024a15..cad516c 100644
--- a/docs/reference/databases.md
+++ b/docs/reference/databases.md
@@ -9,3 +9,9 @@ filters: ["!ISDAccessor", "!_"]
 ## Soundscape Attributes Translation Project (SATP)
 
 ::: soundscapy.databases.satp
+
+## SATP Translation Testing
+
+::: soundscapy.databases.satp_testing
+    options:
+      filters: ["!_"]
diff --git a/src/soundscapy/databases/__init__.py b/src/soundscapy/databases/__init__.py
index f5310b2..995dedf 100644
--- a/src/soundscapy/databases/__init__.py
+++ b/src/soundscapy/databases/__init__.py
@@ -2,9 +2,10 @@
 Soundscapy Databases Module.
 
 This module handles connections to and operations on soundscape databases,
-primarily focused on the International Soundscape Database (ISD).
+primarily focused on the International Soundscape Database (ISD) and the
+Soundscape Attributes Translation Project (SATP).
 """
 
-from soundscapy.databases import isd, satp
+from soundscapy.databases import isd, satp, satp_testing
 
-__all__ = ["isd", "satp"]
+__all__ = ["isd", "satp", "satp_testing"]
diff --git a/src/soundscapy/databases/satp_testing.py b/src/soundscapy/databases/satp_testing.py
new file mode 100644
index 0000000..1dfefca
--- /dev/null
+++ b/src/soundscapy/databases/satp_testing.py
@@ -0,0 +1,500 @@
+"""
+Module for translation testing of soundscape attributes based on SATP methodology.
+
+This module provides functions for computing translation quality metrics and
+statistical tests for validating translations of soundscape attributes. It is
+based on the methodology from the Soundscape Attributes Translation Project (SATP).
+
+The translation quality is assessed using multiple criteria:
+- APPR: Appropriateness (0-1 scale, normalized from 0-10)
+- UNDR: Understandability (0-1 scale, normalized from 0-10)
+- CLAR: Clarity (computed from association with word and counter-word)
+- ANTO: Antonymy (for main axes only)
+- ORTH: Orthogonality (for main axes only)
+- NCON: Non-confusability (for main axes only)
+- IBAL: Importance balance
+- CONN: Connectedness (for derived axes only)
+
+Examples
+--------
+>>> import pandas as pd
+>>> import soundscapy.databases.satp_testing as satp_testing  # doctest: +SKIP
+>>> # Compute main axis criteria from raw data
+>>> df = pd.DataFrame({  # doctest: +SKIP
+...     'COUNTRY': ['SG', 'SG'],
+...     'APPR': [8.0, 9.0],
+...     'UNDR': [7.5, 8.5],
+...     'ANTO': [8.0, 7.0],
+...     'BIAS': [5.0, 4.5],
+...     'ASSOCCW': [6.0, 5.5],
+...     'IMPCCW': [4.0, 3.5],
+...     'ASSOCW': [7.0, 6.5],
+...     'IMPCW': [5.0, 4.5],
+...     'CANDIDATE': ['pleasant', 'pleasant']
+... })
+>>> result = satp_testing.compute_main_axis_criteria(df)  # doctest: +SKIP
+>>> 'CLAR' in result.columns  # doctest: +SKIP
+True
+
+References
+----------
+Based on the R code from the SATP project:
+https://github.com/ntudsp/satp-zsm-stage1
+
+"""
+
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+from loguru import logger
+from scipy import stats
+
+# Constants for statistical tests
+MANN_WHITNEY_NUM_GROUPS = 2  # Mann-Whitney test requires exactly 2 groups
+
+
+def compute_main_axis_criteria(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Compute translation quality criteria for main axis attributes.
+
+    Main axes are the primary soundscape dimensions (e.g., pleasant-annoying,
+    eventful-uneventful) that have antonymic relationships.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame of raw survey responses. A ``KeyError`` is raised unless it has:
+        - APPR: Appropriateness rating (0-10 scale)
+        - UNDR: Understandability rating (0-10 scale)
+        - ANTO: Antonymy rating (0-10 scale)
+        - BIAS: Bias rating (0-10 scale)
+        - ASSOCW: Association with word (0-10 scale)
+        - ASSOCCW: Association with counter-word (0-10 scale)
+        - IMPCW: Importance with word (0-10 scale)
+        - IMPCCW: Importance with counter-word (0-10 scale)
+        Additional columns (e.g., COUNTRY, CANDIDATE) will be preserved.
+
+    Returns
+    -------
+    pd.DataFrame
+        Copy of ``df`` (the input itself is not modified) with:
+        - APPR: Normalized appropriateness (0-1 scale)
+        - UNDR: Normalized understandability (0-1 scale)
+        - ANTO: Normalized antonymy (0-1 scale)
+        - CLAR: Clarity (0-1 scale, higher is better)
+        - ORTH: Orthogonality (0-1 scale, higher is better)
+        - NCON: Non-confusability (0-1 scale, higher is better)
+        - IBAL: Importance balance (0-1 scale, higher is better)
+        Plus all other input columns, including the raw BIAS/ASSOC*/IMPC* ratings.
+
+    Notes
+    -----
+    The criteria are computed as follows:
+    - APPR, UNDR, ANTO: Normalized from 0-10 to 0-1 scale
+    - CLAR: 1 - 0.5*(ASSOCW/10) - 0.5*(ASSOCCW/10)
+    - ORTH: 1 - 2*|BIAS/10 - 0.5|
+    - NCON: 1 - 0.5*(IMPCW/10 + IMPCCW/10)
+    - IBAL: 1 - |IMPCCW/10 - IMPCW/10|
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'APPR': [8.0, 9.0],
+    ...     'UNDR': [7.5, 8.5],
+    ...     'ANTO': [8.0, 7.0],
+    ...     'BIAS': [5.0, 4.5],
+    ...     'ASSOCCW': [6.0, 5.5],
+    ...     'IMPCCW': [4.0, 3.5],
+    ...     'ASSOCW': [7.0, 6.5],
+    ...     'IMPCW': [5.0, 4.5],
+    ...     'CANDIDATE': ['pleasant', 'pleasant']
+    ... })
+    >>> result = compute_main_axis_criteria(df)
+    >>> float(result['APPR'].iloc[0])  # plain float: same repr on NumPy 1 and 2
+    0.8
+    >>> round(float(result['CLAR'].iloc[0]), 2)
+    0.35
+    >>> float(result['ORTH'].iloc[0])
+    1.0
+
+    """
+    result = df.copy()
+
+    # Rescale the directly rated criteria from the 0-10 survey scale to 0-1
+    for rated in ("APPR", "UNDR", "ANTO"):
+        result[rated] = result[rated] / 10
+
+    # Criteria derived from the association, bias and importance ratings; the
+    # raw rating columns they are built from stay on their original 0-10 scale
+    result["CLAR"] = 1 - 0.5 * result["ASSOCW"] / 10 - 0.5 * result["ASSOCCW"] / 10
+    result["ORTH"] = 1 - 2 * np.abs(result["BIAS"] / 10 - 0.5)
+    result["NCON"] = 1 - 0.5 * (result["IMPCW"] / 10 + result["IMPCCW"] / 10)
+    result["IBAL"] = 1 - np.abs(result["IMPCCW"] / 10 - result["IMPCW"] / 10)
+
+    logger.debug("Computed main axis criteria for dataset")
+    return result
+
+
+def compute_derived_axis_criteria(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Compute translation quality criteria for derived axis attributes.
+
+    Derived axes are secondary soundscape dimensions (e.g., vibrant, calm,
+    monotonous, chaotic) that do not necessarily have direct antonyms.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame of raw survey responses. A ``KeyError`` is raised unless it has:
+        - APPR: Appropriateness rating (0-10 scale)
+        - UNDR: Understandability rating (0-10 scale)
+        - ASSOCW: Association with word (0-10 scale)
+        - ASSOCCW: Association with counter-word (0-10 scale)
+        - IMPCW: Importance with word (0-10 scale)
+        - IMPCCW: Importance with counter-word (0-10 scale)
+        Additional columns (e.g., COUNTRY, CANDIDATE) will be preserved.
+
+    Returns
+    -------
+    pd.DataFrame
+        Copy of ``df`` (the input itself is not modified) with:
+        - APPR: Normalized appropriateness (0-1 scale)
+        - UNDR: Normalized understandability (0-1 scale)
+        - CLAR: Clarity (0-1 scale, higher is better)
+        - CONN: Connectedness (0-1 scale, higher is better)
+        - IBAL: Importance balance (0-1 scale, higher is better)
+        Plus all other input columns, including the raw ASSOC*/IMPC* ratings.
+
+    Notes
+    -----
+    The criteria are computed as follows:
+    - APPR, UNDR: Normalized from 0-10 to 0-1 scale
+    - CLAR: 1 - 0.5*(ASSOCW/10) - 0.5*(ASSOCCW/10)
+    - CONN: 0.5*(IMPCW/10 + IMPCCW/10)
+    - IBAL: 1 - |IMPCCW/10 - IMPCW/10|
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'APPR': [8.0, 9.0],
+    ...     'UNDR': [7.5, 8.5],
+    ...     'ASSOCCW': [6.0, 5.5],
+    ...     'IMPCCW': [4.0, 3.5],
+    ...     'ASSOCW': [7.0, 6.5],
+    ...     'IMPCW': [5.0, 4.5],
+    ...     'CANDIDATE': ['vibrant', 'vibrant']
+    ... })
+    >>> result = compute_derived_axis_criteria(df)
+    >>> float(result['APPR'].iloc[0])  # plain float: same repr on NumPy 1 and 2
+    0.8
+    >>> round(float(result['CONN'].iloc[0]), 2)
+    0.45
+
+    """
+    result = df.copy()
+
+    # Rescale the directly rated criteria from the 0-10 survey scale to 0-1
+    for rated in ("APPR", "UNDR"):
+        result[rated] = result[rated] / 10
+
+    # Criteria derived from the association and importance ratings
+    result["CLAR"] = 1 - 0.5 * result["ASSOCW"] / 10 - 0.5 * result["ASSOCCW"] / 10
+    result["CONN"] = 0.5 * (result["IMPCW"] / 10 + result["IMPCCW"] / 10)
+    result["IBAL"] = 1 - np.abs(result["IMPCCW"] / 10 - result["IMPCW"] / 10)
+
+    logger.debug("Computed derived axis criteria for dataset")
+    return result
+
+
+def summarize_main_axis(
+    df: pd.DataFrame,
+    by_country: bool = False,  # noqa: FBT001, FBT002
+) -> pd.DataFrame:
+    """
+    Average the main axis criteria for each candidate translation.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Main axis criteria, e.g. as returned by compute_main_axis_criteria,
+        with a CANDIDATE column (and COUNTRY when by_country=True).
+    by_country : bool, optional
+        When True, means are taken per (COUNTRY, CANDIDATE) pair.
+        When False (the default), means are taken per CANDIDATE.
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per group: the grouping column(s) followed by the mean of
+        APPR, UNDR, CLAR, ANTO, ORTH, NCON and IBAL.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'CANDIDATE': ['pleasant', 'pleasant', 'annoying', 'annoying'],
+    ...     'COUNTRY': ['SG', 'MY', 'SG', 'MY'],
+    ...     'APPR': [0.8, 0.85, 0.75, 0.8],
+    ...     'UNDR': [0.75, 0.8, 0.7, 0.75],
+    ...     'CLAR': [0.6, 0.65, 0.55, 0.6],
+    ...     'ANTO': [0.8, 0.85, 0.75, 0.8],
+    ...     'ORTH': [0.9, 0.95, 0.85, 0.9],
+    ...     'NCON': [0.7, 0.75, 0.65, 0.7],
+    ...     'IBAL': [0.85, 0.9, 0.8, 0.85]
+    ... })
+    >>> result = summarize_main_axis(df, by_country=False)
+    >>> len(result)
+    2
+    >>> 'APPR' in result.columns
+    True
+
+    """
+    criteria = ["APPR", "UNDR", "CLAR", "ANTO", "ORTH", "NCON", "IBAL"]
+
+    # Group on the candidate alone, or on (country, candidate) pairs
+    group_keys = ["COUNTRY", "CANDIDATE"] if by_country else "CANDIDATE"
+    means = df.groupby(group_keys)[criteria].mean()
+    summary = means.reset_index()
+
+    logger.debug(f"Summarized main axis data (by_country={by_country})")
+    return summary
+
+
+def summarize_derived_axis(
+    df: pd.DataFrame,
+    by_country: bool = False,  # noqa: FBT001, FBT002
+) -> pd.DataFrame:
+    """
+    Average the derived axis criteria for each candidate translation.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Derived axis criteria, e.g. as returned by compute_derived_axis_criteria,
+        with a CANDIDATE column (and COUNTRY when by_country=True).
+    by_country : bool, optional
+        When True, means are taken per (COUNTRY, CANDIDATE) pair.
+        When False (the default), means are taken per CANDIDATE.
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per group: the grouping column(s) followed by the mean of
+        APPR, UNDR, CLAR, CONN and IBAL.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'CANDIDATE': ['vibrant', 'vibrant', 'calm', 'calm'],
+    ...     'COUNTRY': ['SG', 'MY', 'SG', 'MY'],
+    ...     'APPR': [0.8, 0.85, 0.75, 0.8],
+    ...     'UNDR': [0.75, 0.8, 0.7, 0.75],
+    ...     'CLAR': [0.6, 0.65, 0.55, 0.6],
+    ...     'CONN': [0.7, 0.75, 0.65, 0.7],
+    ...     'IBAL': [0.85, 0.9, 0.8, 0.85]
+    ... })
+    >>> result = summarize_derived_axis(df, by_country=False)
+    >>> len(result)
+    2
+    >>> 'CONN' in result.columns
+    True
+
+    """
+    criteria = ["APPR", "UNDR", "CLAR", "CONN", "IBAL"]
+
+    # Group on the candidate alone, or on (country, candidate) pairs
+    group_keys = ["COUNTRY", "CANDIDATE"] if by_country else "CANDIDATE"
+    means = df.groupby(group_keys)[criteria].mean()
+    summary = means.reset_index()
+
+    logger.debug(f"Summarized derived axis data (by_country={by_country})")
+    return summary
+
+
+def kruskal_wallis_test(
+    df: pd.DataFrame,
+    axis_type: Literal["main", "derived"],
+    independent_var: str = "CANDIDATE",
+) -> pd.DataFrame:
+    """
+    Perform Kruskal-Wallis test for each criterion across groups.
+
+    The Kruskal-Wallis test is a non-parametric test to determine if there
+    are significant differences between groups. This is used to test if
+    different translation candidates have significantly different quality scores.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with the computed criteria and the ``independent_var`` column.
+    axis_type : {"main", "derived"}
+        Selects the criteria to test; any other value raises ``ValueError``.
+    independent_var : str, optional
+        Column name to group by for the test. Default is "CANDIDATE".
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per criterion (effect_size is NaN unless n > k) with:
+        - CRITERION: Name of the criterion tested
+        - statistic: Kruskal-Wallis H statistic
+        - pvalue: p-value for the test
+        - effect_size: Eta squared, (H - k + 1) / (n - k) for k groups and n rows
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> np.random.seed(42)
+    >>> df = pd.DataFrame({
+    ...     'CANDIDATE': ['A'] * 10 + ['B'] * 10 + ['C'] * 10,
+    ...     'APPR': np.random.uniform(0.5, 1.0, 30),
+    ...     'UNDR': np.random.uniform(0.5, 1.0, 30),
+    ...     'CLAR': np.random.uniform(0.4, 0.9, 30),
+    ...     'CONN': np.random.uniform(0.5, 0.9, 30),
+    ...     'IBAL': np.random.uniform(0.7, 1.0, 30)
+    ... })
+    >>> result = kruskal_wallis_test(df, axis_type="derived")
+    >>> len(result) == 5  # One row per criterion (derived has 5 criteria)
+    True
+    >>> 'pvalue' in result.columns
+    True
+
+    """
+    if axis_type == "main":
+        criteria = ["APPR", "UNDR", "CLAR", "ANTO", "ORTH", "NCON", "IBAL"]
+    elif axis_type == "derived":
+        criteria = ["APPR", "UNDR", "CLAR", "CONN", "IBAL"]
+    else:
+        msg = "axis_type must be either 'main' or 'derived'"
+        raise ValueError(msg)
+
+    grouped = df.groupby(independent_var)  # same for every criterion: build once
+    results = []
+    for criterion in criteria:
+        # One array of scores per level of the independent variable
+        samples = [scores.to_numpy() for _, scores in grouped[criterion]]
+
+        # Omnibus rank test across all groups
+        statistic, pvalue = stats.kruskal(*samples)
+
+        # Eta squared based on H. n counts only the rows that landed in a group
+        # (groupby drops rows whose group key is missing), and the ratio is
+        # undefined unless there are more observations than groups.
+        n = sum(len(sample) for sample in samples)
+        k = len(samples)
+        effect_size = (statistic - k + 1) / (n - k) if n > k else float("nan")
+
+        results.append(
+            {
+                "CRITERION": criterion,
+                "statistic": statistic,
+                "pvalue": pvalue,
+                "effect_size": effect_size,
+            }
+        )
+
+    logger.debug(f"Performed Kruskal-Wallis test for {axis_type} axis")
+    return pd.DataFrame(results)
+
+
+def mann_whitney_test(
+    df: pd.DataFrame,
+    criteria: list[str],
+    paq_attribute: str,
+    group_var: str = "COUNTRY",
+) -> pd.DataFrame:
+    """
+    Perform Mann-Whitney-Wilcoxon test for each criterion and candidate.
+
+    A candidate is tested only when ``group_var`` takes exactly two distinct
+    non-missing values for it; any other candidate is skipped with a warning.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with computed criteria, a CANDIDATE column and ``group_var``.
+    criteria : list[str]
+        List of criteria column names to test.
+    paq_attribute : str
+        Name of the PAQ (Perceived Affective Quality) attribute being tested
+        (e.g., "pleasant", "eventful"). It is only copied into the PAQ column.
+    group_var : str, optional
+        Column name for grouping variable (e.g., "COUNTRY"). Default is "COUNTRY".
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per tested (criterion, candidate) pair, possibly none, with:
+        - PAQ: PAQ attribute name
+        - CRITERION: Name of the criterion tested
+        - CANDIDATE: Translation candidate
+        - statistic: U statistic
+        - pvalue: p-value for the two-sided test
+        - adjusted_pvalue: pvalue times 2 (the number of groups), capped at 1.0
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> np.random.seed(42)
+    >>> df = pd.DataFrame({
+    ...     'COUNTRY': ['SG'] * 10 + ['MY'] * 10,
+    ...     'CANDIDATE': ['pleasant'] * 20,
+    ...     'APPR': np.random.uniform(0.5, 1.0, 20),
+    ...     'UNDR': np.random.uniform(0.5, 1.0, 20)
+    ... })
+    >>> result = mann_whitney_test(df, ['APPR', 'UNDR'], 'pleasant')
+    >>> len(result) == 2  # One row per criterion
+    True
+
+    """
+    columns = [
+        "PAQ", "CRITERION", "CANDIDATE", "statistic", "pvalue", "adjusted_pvalue"
+    ]
+    results = []
+    for criterion in criteria:
+        for candidate in df["CANDIDATE"].unique():
+            candidate_data = df[df["CANDIDATE"] == candidate]
+
+            # Missing labels are not a group: comparing against NaN selects no rows
+            groups = candidate_data[group_var].dropna().unique()
+            if len(groups) != MANN_WHITNEY_NUM_GROUPS:
+                logger.warning(
+                    f"Skipping {candidate} for {criterion}: "
+                    f"Expected {MANN_WHITNEY_NUM_GROUPS} groups, found {len(groups)}"
+                )
+                continue
+
+            group1 = candidate_data[candidate_data[group_var] == groups[0]][criterion]
+            group2 = candidate_data[candidate_data[group_var] == groups[1]][criterion]
+            statistic, pvalue = stats.mannwhitneyu(
+                group1, group2, alternative="two-sided"
+            )
+
+            # NOTE(review): the factor is the number of groups, not the number of
+            # tests run; Bonferroni conventionally uses the latter - confirm intent.
+            adjusted_pvalue = min(pvalue * MANN_WHITNEY_NUM_GROUPS, 1.0)
+            results.append(
+                {
+                    "PAQ": paq_attribute,
+                    "CRITERION": criterion,
+                    "CANDIDATE": candidate,
+                    "statistic": statistic,
+                    "pvalue": pvalue,
+                    "adjusted_pvalue": adjusted_pvalue,
+                }
+            )
+
+    logger.debug(f"Performed Mann-Whitney test for {paq_attribute}")
+    return pd.DataFrame(results, columns=columns)  # schema kept even when empty
+
+
+if __name__ == "__main__":
+    # Executed as a script: run the docstring examples of this module
+    from xdoctest import doctest_module
+    doctest_module(__file__)
diff --git a/test/databases/test_satp_testing.py b/test/databases/test_satp_testing.py
new file mode 100644
index 0000000..b826462
--- /dev/null
+++ b/test/databases/test_satp_testing.py
@@ -0,0 +1,447 @@
+"""Tests for the SATP translation testing module."""
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from soundscapy.databases import satp_testing
+
+
+class TestComputeMainAxisCriteria:
+    """Test suite for compute_main_axis_criteria function."""
+
+    def test_basic_computation(self):
+        """Test basic computation of main axis criteria."""
+        df = pd.DataFrame({
+            "APPR": [8.0, 9.0],
+            "UNDR": [7.5, 8.5],
+            "ANTO": [8.0, 7.0],
+            "BIAS": [5.0, 4.5],
+            "ASSOCCW": [6.0, 5.5],
+            "IMPCCW": [4.0, 3.5],
+            "ASSOCW": [7.0, 6.5],
+            "IMPCW": [5.0, 4.5],
+            "CANDIDATE": ["pleasant", "pleasant"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+
+        # Normalized ratings are floats, so compare with a tolerance
+        assert result["APPR"].iloc[0] == pytest.approx(0.8)
+        assert result["UNDR"].iloc[0] == pytest.approx(0.75)
+        assert result["ANTO"].iloc[0] == pytest.approx(0.8)
+
+        # Derived criteria of the first row, worked out from the formulas
+        assert result["CLAR"].iloc[0] == pytest.approx(0.35)
+        assert result["ORTH"].iloc[0] == pytest.approx(1.0)
+        assert result["NCON"].iloc[0] == pytest.approx(0.55)
+        assert result["IBAL"].iloc[0] == pytest.approx(0.9)
+
+        # Check that CANDIDATE column is preserved
+        assert "CANDIDATE" in result.columns
+
+    def test_clar_computation(self):
+        """Test CLAR (clarity) computation formula."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0], "ANTO": [5.0], "BIAS": [5.0],
+            "ASSOCCW": [4.0], "IMPCCW": [5.0],
+            "ASSOCW": [6.0], "IMPCW": [5.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+        # CLAR = 1 - 0.5*(6.0/10) - 0.5*(4.0/10) = 1 - 0.3 - 0.2 = 0.5
+        assert result["CLAR"].iloc[0] == pytest.approx(0.5)
+
+    def test_orth_computation(self):
+        """Test ORTH (orthogonality) computation formula."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0], "ANTO": [5.0], "BIAS": [5.0],
+            "ASSOCCW": [5.0], "IMPCCW": [5.0],
+            "ASSOCW": [5.0], "IMPCW": [5.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+        # ORTH = 1 - 2*|5.0/10 - 0.5| = 1 - 2*|0.5 - 0.5| = 1.0
+        assert result["ORTH"].iloc[0] == pytest.approx(1.0)
+
+        # Test with bias
+        df["BIAS"] = [7.0]
+        result = satp_testing.compute_main_axis_criteria(df)
+        # ORTH = 1 - 2*|7.0/10 - 0.5| = 1 - 2*0.2 = 0.6
+        assert result["ORTH"].iloc[0] == pytest.approx(0.6)
+
+    def test_ncon_computation(self):
+        """Test NCON (non-confusability) computation formula."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0], "ANTO": [5.0], "BIAS": [5.0],
+            "ASSOCCW": [5.0], "IMPCCW": [4.0],
+            "ASSOCW": [5.0], "IMPCW": [6.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+        # NCON = 1 - 0.5*(6.0/10 + 4.0/10) = 1 - 0.5*1.0 = 0.5
+        assert result["NCON"].iloc[0] == pytest.approx(0.5)
+
+    def test_ibal_computation(self):
+        """Test IBAL (importance balance) computation formula."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0], "ANTO": [5.0], "BIAS": [5.0],
+            "ASSOCCW": [5.0], "IMPCCW": [6.0],
+            "ASSOCW": [5.0], "IMPCW": [4.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+        # IBAL = 1 - |6.0/10 - 4.0/10| = 1 - 0.2 = 0.8
+        assert result["IBAL"].iloc[0] == pytest.approx(0.8)
+
+    def test_preserves_other_columns(self):
+        """Test that non-computed columns are preserved."""
+        df = pd.DataFrame({
+            "COUNTRY": ["SG", "MY"],
+            "APPR": [8.0, 9.0],
+            "UNDR": [7.5, 8.5],
+            "ANTO": [8.0, 7.0],
+            "BIAS": [5.0, 4.5],
+            "ASSOCCW": [6.0, 5.5],
+            "IMPCCW": [4.0, 3.5],
+            "ASSOCW": [7.0, 6.5],
+            "IMPCW": [5.0, 4.5],
+            "CANDIDATE": ["pleasant", "pleasant"],
+        })
+
+        result = satp_testing.compute_main_axis_criteria(df)
+        assert "COUNTRY" in result.columns
+        assert result["COUNTRY"].tolist() == ["SG", "MY"]
+
+
+class TestComputeDerivedAxisCriteria:
+    """Test suite for compute_derived_axis_criteria function."""
+
+    def test_basic_computation(self):
+        """Test basic computation of derived axis criteria."""
+        df = pd.DataFrame({
+            "APPR": [8.0, 9.0],
+            "UNDR": [7.5, 8.5],
+            "ASSOCCW": [6.0, 5.5],
+            "IMPCCW": [4.0, 3.5],
+            "ASSOCW": [7.0, 6.5],
+            "IMPCW": [5.0, 4.5],
+            "CANDIDATE": ["vibrant", "vibrant"],
+        })
+
+        result = satp_testing.compute_derived_axis_criteria(df)
+
+        # Normalized ratings are floats, so compare with a tolerance
+        assert result["APPR"].iloc[0] == pytest.approx(0.8)
+        assert result["UNDR"].iloc[0] == pytest.approx(0.75)
+
+        # Derived criteria of the first row, worked out from the formulas
+        assert result["CLAR"].iloc[0] == pytest.approx(0.35)
+        assert result["CONN"].iloc[0] == pytest.approx(0.45)
+        assert result["IBAL"].iloc[0] == pytest.approx(0.9)
+
+        # Should not have main-axis-only criteria
+        assert "ANTO" not in result.columns
+        assert "ORTH" not in result.columns
+        assert "NCON" not in result.columns
+
+    def test_conn_computation(self):
+        """Test CONN (connectedness) computation formula."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0],
+            "ASSOCCW": [5.0], "IMPCCW": [4.0],
+            "ASSOCW": [5.0], "IMPCW": [6.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_derived_axis_criteria(df)
+        # CONN = 0.5*(6.0/10 + 4.0/10) = 0.5*1.0 = 0.5
+        assert result["CONN"].iloc[0] == pytest.approx(0.5)
+
+    def test_clar_computation_derived(self):
+        """Test CLAR computation for derived axis (same as main axis)."""
+        df = pd.DataFrame({
+            "APPR": [5.0], "UNDR": [5.0],
+            "ASSOCCW": [4.0], "IMPCCW": [5.0],
+            "ASSOCW": [6.0], "IMPCW": [5.0],
+            "CANDIDATE": ["test"],
+        })
+
+        result = satp_testing.compute_derived_axis_criteria(df)
+        # CLAR = 1 - 0.5*(6.0/10) - 0.5*(4.0/10) = 1 - 0.3 - 0.2 = 0.5
+        assert result["CLAR"].iloc[0] == pytest.approx(0.5)
+
+
+class TestSummarizeMainAxis:
+    """Checks for the summarize_main_axis helper."""
+
+    def test_summary_without_country(self):
+        """Grouping on the candidate alone averages over both countries."""
+        ratings = pd.DataFrame({
+            "CANDIDATE": ["pleasant", "pleasant", "annoying", "annoying"],
+            "COUNTRY": ["SG", "MY", "SG", "MY"],
+            "APPR": [0.8, 0.85, 0.75, 0.8],
+            "UNDR": [0.75, 0.8, 0.7, 0.75],
+            "CLAR": [0.6, 0.65, 0.55, 0.6],
+            "ANTO": [0.8, 0.85, 0.75, 0.8],
+            "ORTH": [0.9, 0.95, 0.85, 0.9],
+            "NCON": [0.7, 0.75, 0.65, 0.7],
+            "IBAL": [0.85, 0.9, 0.8, 0.85],
+        })
+
+        summary = satp_testing.summarize_main_axis(ratings, by_country=False)
+
+        # Two distinct candidates, hence two summary rows
+        assert len(summary) == 2
+        assert set(summary["CANDIDATE"]) == {"pleasant", "annoying"}
+
+        # Mean of 0.8 and 0.85 for the "pleasant" candidate
+        pleasant_row = summary.set_index("CANDIDATE").loc["pleasant"]
+        assert pleasant_row["APPR"] == pytest.approx(0.825)
+
+    def test_summary_with_country(self):
+        """Grouping on country and candidate keeps the countries apart."""
+        ratings = pd.DataFrame({
+            "CANDIDATE": ["pleasant", "pleasant", "pleasant", "pleasant"],
+            "COUNTRY": ["SG", "SG", "MY", "MY"],
+            "APPR": [0.8, 0.82, 0.85, 0.87],
+            "UNDR": [0.75, 0.77, 0.8, 0.82],
+            "CLAR": [0.6, 0.62, 0.65, 0.67],
+            "ANTO": [0.8, 0.82, 0.85, 0.87],
+            "ORTH": [0.9, 0.92, 0.95, 0.97],
+            "NCON": [0.7, 0.72, 0.75, 0.77],
+            "IBAL": [0.85, 0.87, 0.9, 0.92],
+        })
+
+        summary = satp_testing.summarize_main_axis(ratings, by_country=True)
+
+        # One candidate in two countries, hence two summary rows
+        assert len(summary) == 2
+        assert "COUNTRY" in summary.columns
+
+        # Mean of 0.8 and 0.82 for the SG responses
+        sg_row = summary.set_index("COUNTRY").loc["SG"]
+        assert sg_row["APPR"] == pytest.approx(0.81)
+
+
+class TestSummarizeDerivedAxis:
+    """Checks for the summarize_derived_axis helper."""
+
+    def test_summary_without_country(self):
+        """Grouping on the candidate alone yields one row per candidate."""
+        ratings = pd.DataFrame({
+            "CANDIDATE": ["vibrant", "vibrant", "calm", "calm"],
+            "COUNTRY": ["SG", "MY", "SG", "MY"],
+            "APPR": [0.8, 0.85, 0.75, 0.8],
+            "UNDR": [0.75, 0.8, 0.7, 0.75],
+            "CLAR": [0.6, 0.65, 0.55, 0.6],
+            "CONN": [0.7, 0.75, 0.65, 0.7],
+            "IBAL": [0.85, 0.9, 0.8, 0.85],
+        })
+
+        summary = satp_testing.summarize_derived_axis(ratings, by_country=False)
+
+        # Two distinct candidates, hence two summary rows
+        assert len(summary) == 2
+        assert set(summary["CANDIDATE"]) == {"vibrant", "calm"}
+
+        # CONN is reported; the main-axis-only criteria are not
+        assert "CONN" in summary.columns
+        assert "ANTO" not in summary.columns
+        assert "ORTH" not in summary.columns
+        assert "NCON" not in summary.columns
+
+
+class TestKruskalWallisTest:
+    """Checks for the kruskal_wallis_test helper."""
+
+    def test_main_axis_test(self):
+        """The main-axis variant reports one row for each of its criteria."""
+        np.random.seed(42)
+        scores = pd.DataFrame({
+            "CANDIDATE": ["A"] * 10 + ["B"] * 10 + ["C"] * 10,
+            "APPR": np.random.uniform(0.5, 1.0, 30),
+            "UNDR": np.random.uniform(0.5, 1.0, 30),
+            "CLAR": np.random.uniform(0.4, 0.9, 30),
+            "ANTO": np.random.uniform(0.5, 1.0, 30),
+            "ORTH": np.random.uniform(0.6, 1.0, 30),
+            "NCON": np.random.uniform(0.4, 0.8, 30),
+            "IBAL": np.random.uniform(0.7, 1.0, 30),
+        })
+
+        outcome = satp_testing.kruskal_wallis_test(scores, axis_type="main")
+
+        # Seven main-axis criteria, hence seven rows
+        assert len(outcome) == 7
+
+        # Every result column has to be present
+        assert "CRITERION" in outcome.columns
+        assert "statistic" in outcome.columns
+        assert "pvalue" in outcome.columns
+        assert "effect_size" in outcome.columns
+
+        # Each main-axis criterion shows up in the CRITERION column
+        main_criteria = {"APPR", "UNDR", "CLAR", "ANTO", "ORTH", "NCON", "IBAL"}
+        assert set(outcome["CRITERION"]) == main_criteria
+
+    def test_derived_axis_test(self):
+        """The derived-axis variant reports one row for each of its criteria."""
+        np.random.seed(42)
+        scores = pd.DataFrame({
+            "CANDIDATE": ["A"] * 10 + ["B"] * 10 + ["C"] * 10,
+            "APPR": np.random.uniform(0.5, 1.0, 30),
+            "UNDR": np.random.uniform(0.5, 1.0, 30),
+            "CLAR": np.random.uniform(0.4, 0.9, 30),
+            "CONN": np.random.uniform(0.5, 0.9, 30),
+            "IBAL": np.random.uniform(0.7, 1.0, 30),
+        })
+
+        outcome = satp_testing.kruskal_wallis_test(scores, axis_type="derived")
+
+        # Five derived-axis criteria, hence five rows
+        assert len(outcome) == 5
+
+        # Each derived-axis criterion shows up in the CRITERION column
+        derived_criteria = {"APPR", "UNDR", "CLAR", "CONN", "IBAL"}
+        assert set(outcome["CRITERION"]) == derived_criteria
+
+    def test_invalid_axis_type(self):
+        """An unknown axis type is rejected with a ValueError."""
+        # A tiny frame suffices for exercising the axis_type check
+        frame = pd.DataFrame({"CANDIDATE": ["A", "B"], "APPR": [0.5, 0.6]})
+        expected_message = "axis_type must be either 'main' or 'derived'"
+
+        # The error has to carry the expected message
+        with pytest.raises(ValueError, match=expected_message):
+            satp_testing.kruskal_wallis_test(frame, axis_type="invalid")
+
+
+class TestMannWhitneyTest:
+    """Checks for the mann_whitney_test helper."""
+
+    def test_basic_test(self):
+        """Two countries and one candidate give one row per criterion."""
+        np.random.seed(42)
+        scores = pd.DataFrame({
+            "COUNTRY": ["SG"] * 10 + ["MY"] * 10,
+            "CANDIDATE": ["pleasant"] * 20,
+            "APPR": np.random.uniform(0.5, 1.0, 20),
+            "UNDR": np.random.uniform(0.5, 1.0, 20),
+        })
+
+        outcome = satp_testing.mann_whitney_test(
+            scores, ["APPR", "UNDR"], "pleasant"
+        )
+
+        # Two criteria for a single candidate, hence two rows
+        assert len(outcome) == 2
+
+        # Every result column has to be present
+        assert "PAQ" in outcome.columns
+        assert "CRITERION" in outcome.columns
+        assert "CANDIDATE" in outcome.columns
+        assert "statistic" in outcome.columns
+        assert "pvalue" in outcome.columns
+        assert "adjusted_pvalue" in outcome.columns
+
+        # The attribute name is copied onto every row
+        assert all(outcome["PAQ"] == "pleasant")
+
+    def test_multiple_candidates(self):
+        """Each candidate gets its own row for a given criterion."""
+        np.random.seed(42)
+        scores = pd.DataFrame({
+            "COUNTRY": ["SG"] * 10 + ["MY"] * 10 + ["SG"] * 10 + ["MY"] * 10,
+            "CANDIDATE": ["pleasant"] * 20 + ["annoying"] * 20,
+            "APPR": np.random.uniform(0.5, 1.0, 40),
+        })
+
+        outcome = satp_testing.mann_whitney_test(scores, ["APPR"], "test")
+
+        # One criterion for two candidates, hence two rows
+        assert len(outcome) == 2
+        assert set(outcome["CANDIDATE"]) == {"pleasant", "annoying"}
+
+    def test_bonferroni_correction(self):
+        """The adjusted p-value is the doubled raw p-value, capped at 1.0."""
+        np.random.seed(42)
+        scores = pd.DataFrame({
+            "COUNTRY": ["SG"] * 10 + ["MY"] * 10,
+            "CANDIDATE": ["pleasant"] * 20,
+            "APPR": np.random.uniform(0.5, 1.0, 20),
+        })
+
+        outcome = satp_testing.mann_whitney_test(scores, ["APPR"], "pleasant")
+
+        # Compare the reported adjustment with 2x the raw value (at most 1.0)
+        raw_p = outcome["pvalue"].iloc[0]
+        adjusted_p = outcome["adjusted_pvalue"].iloc[0]
+        assert adjusted_p == pytest.approx(min(raw_p * 2, 1.0))
+
+
+class TestIntegration:
+    """End-to-end checks chaining the compute, summarise and test steps."""
+
+    def test_main_axis_workflow(self):
+        """Run compute -> summarise -> Kruskal-Wallis for a main axis."""
+        np.random.seed(42)
+        n_per_candidate = 5
+        half = n_per_candidate * 2  # rows belonging to a single candidate
+        n_rows = n_per_candidate * 4
+
+        # Mock survey columns; dict order fixes both the seeded draw order
+        # and the column order of the resulting frame.
+        bounds = {
+            "APPR": (7, 10), "UNDR": (7, 10), "ANTO": (7, 10), "BIAS": (4, 6),
+            "ASSOCCW": (3, 7), "IMPCCW": (3, 7), "ASSOCW": (3, 7), "IMPCW": (3, 7),
+        }
+        columns = {"COUNTRY": ["SG", "MY"] * half}
+        for name, (low, high) in bounds.items():
+            columns[name] = np.random.uniform(low, high, n_rows)
+        columns["CANDIDATE"] = ["pleasant"] * half + ["annoying"] * half
+        survey = pd.DataFrame(columns)
+
+        # Step 1: compute the criteria columns.
+        computed = satp_testing.compute_main_axis_criteria(survey)
+        for derived_column in ("CLAR", "ORTH"):
+            assert derived_column in computed.columns
+
+        # Step 2: summary without country split -> one row per candidate.
+        summary = satp_testing.summarize_main_axis(computed, by_country=False)
+        assert len(summary) == 2
+
+        # Step 3: the statistical test is expected to return 7 rows for a main axis.
+        assert len(satp_testing.kruskal_wallis_test(computed, axis_type="main")) == 7
+
+    def test_derived_axis_workflow(self):
+        """Run compute -> summarise -> Kruskal-Wallis for a derived axis."""
+        np.random.seed(42)
+        n_per_candidate = 5
+        half = n_per_candidate * 2  # rows belonging to a single candidate
+        n_rows = n_per_candidate * 4
+        # Mock survey columns, drawn in dict order after seeding.
+        bounds = {
+            "APPR": (7, 10), "UNDR": (7, 10),
+            "ASSOCCW": (3, 7), "IMPCCW": (3, 7), "ASSOCW": (3, 7), "IMPCW": (3, 7),
+        }
+        columns = {"COUNTRY": ["SG", "MY"] * half}
+        for name, (low, high) in bounds.items():
+            columns[name] = np.random.uniform(low, high, n_rows)
+        columns["CANDIDATE"] = ["vibrant"] * half + ["calm"] * half
+        survey = pd.DataFrame(columns)
+
+        # Step 1: compute the criteria columns.
+        computed = satp_testing.compute_derived_axis_criteria(survey)
+        for derived_column in ("CLAR", "CONN"):
+            assert derived_column in computed.columns
+
+        # Step 2: per-country summary -> two countries x two candidates.
+        summary = satp_testing.summarize_derived_axis(computed, by_country=True)
+        assert len(summary) == 4
+
+        # Step 3: one row per criterion (5 for a derived axis).
+        assert len(satp_testing.kruskal_wallis_test(computed, axis_type="derived")) == 5