wavlab-speech · ftshijt · Jun 16, 2025 · Jun 19, 2025 · Jun 19, 2025 · Jun 30, 2025
diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md
@@ -50,7 +50,7 @@ We include x mark if the metric is auto-installed in versa.
 | 43 | x | Qwen2 Recording Environment - Background | qwen2_speech_background_environment_metric | qwen2_speech_background_environment_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
 | 44 | x | Qwen2 Recording Environment - Quality | qwen2_recording_quality_metric | qwen2_recording_quality_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
 | 45 | x | Qwen2 Recording Environment - Channel Type | qwen2_channel_type_metric | qwen2_channel_type_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
-| 46 | x | Dimensional Emotion | w2v2_dimensional_emotion | w2v2_dimensional_emotion | [w2v2-how-to](https://github.com/audeering/w2v2-how-to) | [paper](https://arxiv.org/pdf/2203.07378) |
+| 46 | x | Dimensional Emotion | emo_vad | arousal_emo_vad, valence_emo_vad, dominance_emo_vad | [w2v2-how-to](https://github.com/audeering/w2v2-how-to) | [paper](https://arxiv.org/pdf/2203.07378) |
 | 47 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) |
 | 48 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech  | pseudo_mos | dnsmos_pro_bvcc | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
 | 49 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech  | pseudo_mos | dnsmos_pro_nisqa | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |

diff --git a/egs/separate_metrics/cdpam_distance.yaml b/egs/separate_metrics/cdpam_distance.yaml
@@ -0,0 +1,5 @@
+# CDPAM distance metrics
+# CDPAM distance between audio samples
+# More info in https://github.com/pranaymanocha/PerceptualAudio
+# -- cdpam_distance: the CDPAM distance between audio samples
+- name: cdpam_distance 
diff --git a/egs/separate_metrics/chroma_alignment.yaml b/egs/separate_metrics/chroma_alignment.yaml
@@ -0,0 +1,20 @@
+# Chroma Alignment related metrics
+# Chroma-based distance estimation with dynamic programming alignment
+# Uses librosa chroma features (STFT, CQT, CENS) with DTW alignment
+# -- chroma_stft_cosine_dtw: STFT chroma features with cosine distance and DTW
+# -- chroma_stft_euclidean_dtw: STFT chroma features with euclidean distance and DTW
+# -- chroma_cqt_cosine_dtw: CQT chroma features with cosine distance and DTW
+# -- chroma_cqt_euclidean_dtw: CQT chroma features with euclidean distance and DTW
+# -- chroma_cens_cosine_dtw: CENS chroma features with cosine distance and DTW
+# -- chroma_cens_euclidean_dtw: CENS chroma features with euclidean distance and DTW
+# -- chroma_stft_cosine_dtw_raw: Raw DTW distance with higher scaling
+# -- chroma_stft_cosine_dtw_log: Log-scaled DTW distance
+- name: chroma_alignment
+  sample_rate: 22050
+  feature_types: ["stft", "cqt", "cens"]
+  distance_metrics: ["cosine", "euclidean"]
+  scale_factor: 100.0
+  normalize: True
+  normalize_by_path: True
+  return_alignment: False
+  chroma_kwargs: {} 
diff --git a/egs/separate_metrics/dpam_distance.yaml b/egs/separate_metrics/dpam_distance.yaml
@@ -0,0 +1,5 @@
+# DPAM distance metrics
+# DPAM distance between audio samples
+# More info in https://github.com/adrienchaton/PerceptualAudio_Pytorch
+# -- dpam_distance: the DPAM distance between audio samples
+- name: dpam_distance 
diff --git a/egs/separate_metrics/emo_vad.yaml b/egs/separate_metrics/emo_vad.yaml
@@ -0,0 +1,7 @@
+# EmoVad related metrics
+# Dimensional emotion prediction (arousal, valence, dominance) using w2v2-how-to
+# More info in https://github.com/audeering/w2v2-how-to
+# -- arousal_emo_vad: the predicted arousal score from the w2v2 dimensional emotion model
+# -- valence_emo_vad: the predicted valence score from the w2v2 dimensional emotion model
+# -- dominance_emo_vad: the predicted dominance score from the w2v2 dimensional emotion model
+- name: emo_vad 
diff --git a/egs/separate_metrics/lid.yaml b/egs/separate_metrics/lid.yaml
@@ -1,10 +1,10 @@
-
-# Word error rate with ESPnet-OWSM model
+# Language Identification with ESPnet-OWSM model
 # More model_tag can be from the ESPnet huggingface https://huggingface.co/espnet .
 # The default model is `espnet/owsm_v3.1_ebf`.
-# --lid: the nbest language tag
+# -- language: the nbest language tag
 - name: lid
   model_tag: default
   nbest: 5
+  use_gpu: false
 
 
diff --git a/egs/separate_metrics/nisqa.yaml b/egs/separate_metrics/nisqa.yaml
@@ -3,8 +3,9 @@
 #  -- nisqa_noi_pred: NISQA noise prediction
 #  -- nisqa_dis_pred: NISQA distortion prediction
 #  -- nisqa_col_pred: NISQA color prediction
-#  --nisqa_loud_pred: NISQA loudness prediction
+#  -- nisqa_loud_pred: NISQA loudness prediction
 
 # NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
 - name: nisqa
   nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
+  use_gpu: false
diff --git a/egs/separate_metrics/nomad.yaml b/egs/separate_metrics/nomad.yaml
@@ -2,3 +2,4 @@
 # -- nomad: nomad reference-based model
 - name: nomad
   model_cache: versa_cache/nomad_pt-models
+  use_gpu: false
diff --git a/egs/separate_metrics/noresqa.yaml b/egs/separate_metrics/noresqa.yaml
@@ -1,4 +1,8 @@
 # noresqa related metrics
-# -- noresqa: non-matching reference based speech quality assessment
-- name: noresqa
-  metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
+# -- noresqa_mos: NORESQA-MOS (metric_type=1)
+# -- noresqa_score: NORESQA-score (metric_type=0)
+- name: noresqa_mos
+  metric_type: 1  # 0: NORESQA-score, 1: NORESQA-MOS
+  model_tag: default
+  cache_dir: versa_cache/noresqa_model
+  use_gpu: false
diff --git a/egs/separate_metrics/pesq.yaml b/egs/separate_metrics/pesq.yaml
@@ -0,0 +1,11 @@
+# PESQ: Perceptual Evaluation of Speech Quality
+# https://www.itu.int/rec/T-REC-P.862
+# 
+# PESQ is a reference-based metric that measures speech quality
+# by comparing a degraded signal to a reference signal.
+# 
+# Supported sample rates:
+# - 8kHz: narrowband (nb) mode
+# - 16kHz: wideband (wb) mode
+# - Other rates: automatically resampled to nearest supported rate
+- name: pesq 
diff --git a/egs/separate_metrics/w2v2_dimensional_emotion.yaml b/egs/separate_metrics/w2v2_dimensional_emotion.yaml
diff --git a/setup.py b/setup.py
@@ -1,62 +1,108 @@
 from setuptools import setup, find_packages
+import os
+
+
+# Read README.md (next to this file) for the long description; fall back to a one-line summary if it is missing
+def read_readme():
+    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
+    if os.path.exists(readme_path):
+        with open(readme_path, "r", encoding="utf-8") as f:
+            return f.read()
+    return "A package for versatile evaluation of speech and audio"
+
 
 setup(
     name="versa-speech-audio-toolkit",
     version="1.0.0",
+    author="Jiatong Shi",
+    author_email="ftshijt@gmail.com",
+    description="A package for versatile evaluation of speech and audio",
+    long_description=read_readme(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/wavlab-speech/versa.git",
     packages=find_packages(),
+    python_requires=">=3.8",
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Multimedia :: Sound/Audio :: Analysis",
+    ],
+    keywords=["speech", "audio", "metrics", "evaluation", "machine learning"],
     install_requires=[
+        # Core ML and Deep Learning
+        "torch",
+        "torchaudio",
+        "transformers>=4.36.2",
         "accelerate",
-        "audioread",
-        "ci-sdr",
-        "Cython",
-        "Distance",
-        "editdistance",
-        "einops",
-        "espnet @ git+https://github.com/ftshijt/espnet.git@espnet_inference#egg=espnet",
-        "espnet-tts-frontend",
-        "fast-bss-eval",
-        "fastdtw",
         "huggingface-hub",
-        "hydra-core",
-        "idna",
-        "importlib-metadata",
-        "kaggle",
-        "kaldiio",
-        "lazy_loader",
-        "Levenshtein",
-        "librosa",
-        "mir-eval",
-        "omegaconf",
-        "onnxruntime",
-        # NOTE(jiatong): use the latest commit for python 3.13
-        "openai-whisper @ git+https://github.com/openai/whisper.git",
+        "safetensors",
+        "tokenizers",
+        "einops",
         "opt-einsum",
-        "pesq",
-        "protobuf",
+        # Audio Processing
+        "librosa",
+        "soundfile",
+        "audioread",
+        "resampy",
+        "torchlibrosa",
+        "pyworld",
         "pysptk",
+        # Speech and Audio Evaluation Metrics
+        "pesq",
         "pystoi",
-        "python-dateutil",
-        "pyworld",
-        "pyyaml",
+        "mir-eval",
+        "fast-bss-eval",
+        "ci-sdr",
+        "speechmos",
+        # Text Processing and Distance Metrics
+        "Levenshtein",
+        "editdistance",
+        "Distance",
         "rapidfuzz",
-        "resampy",
-        "safetensors",
-        "scikit-learn",
         "sentencepiece",
-        "setuptools",
-        "soundfile",
-        "speechmos",
+        # Scientific Computing
+        "scikit-learn",
         "sympy",
         "threadpoolctl",
-        "tokenizers",
-        "torch",
-        "torch-complex",
-        "torchaudio",
-        "torchlibrosa",
-        "s3prl @ git+https://github.com/ftshijt/s3prl.git@numpy2#egg=s3prl",
-        "transformers>=4.36.2",
+        # Configuration and Utilities
+        "hydra-core",
+        "omegaconf",
+        "pyyaml",
+        "protobuf",
+        "python-dateutil",
+        "lazy_loader",
+        # Build and Compatibility
+        "Cython",
+        "setuptools",
+        "importlib-metadata",
+        "idna",
+        # External Services, Data I/O, and Runtimes (required, not optional extras)
+        "kaggle",
+        "kaldiio",
+        "fastdtw",
+        "onnxruntime",
+        # Speech/Audio Frameworks (espnet is installed from a git fork; the others come from PyPI)
+        "espnet @ git+https://github.com/ftshijt/espnet.git@espnet_inference#egg=espnet",
+        "espnet-tts-frontend",
         "espnet_model_zoo",
+        "s3prl",
+        # Git Dependencies - Audio Models
+        # NOTE: Using latest commit for Python 3.13 compatibility
+        "openai-whisper @ git+https://github.com/openai/whisper.git",
+        # Git Dependencies - Evaluation Metrics
         "discrete-speech-metrics @ git+https://github.com/ftshijt/DiscreteSpeechMetrics.git@v1.0.2",
+        # Additional Dependencies
+        "torch-complex",
         "cdpam",
     ],
     extras_require={
@@ -65,16 +111,25 @@
             "pytest-cov>=2.10.0",
             "black>=22.3.0",
             "flake8>=4.0.0",
+            "isort>=5.0.0",
+            "mypy>=0.900",
+        ],
+        "docs": [
+            "sphinx>=4.0.0",
+            "sphinx-rtd-theme>=1.0.0",
+            "myst-parser>=0.17.0",
+        ],
+        "jupyter": [
+            "jupyter>=1.0.0",
+            "ipykernel>=6.0.0",
+            "matplotlib>=3.3.0",
         ],
     },
     entry_points={
         "console_scripts": [
             "versa-score=versa.bin.scorer:main",
         ],
     },
-    author="Jiatong Shi",
-    author_email="ftshijt@gmail.com",
-    description="A package for versatile evaluation of speech and audio",
-    url="https://github.com/shinjiwlab/versa.git",
-    keywords="speech metrics",
+    include_package_data=True,
+    zip_safe=False,
 )