Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ We include x mark if the metric is auto-installed in versa.
| 43 | x | Qwen2 Recording Environment - Background | qwen2_speech_background_environment_metric | qwen2_speech_background_environment_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 44 | x | Qwen2 Recording Environment - Quality | qwen2_recording_quality_metric | qwen2_recording_quality_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 45 | x | Qwen2 Recording Environment - Channel Type | qwen2_channel_type_metric | qwen2_channel_type_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 46 | x | Dimensional Emotion | w2v2_dimensional_emotion | w2v2_dimensional_emotion | [w2v2-how-to](https://github.com/audeering/w2v2-how-to) | [paper](https://arxiv.org/pdf/2203.07378) |
| 46 | x | Dimensional Emotion | emo_vad | arousal_emo_vad, valence_emo_vad, dominance_emo_vad | [w2v2-how-to](https://github.com/audeering/w2v2-how-to) | [paper](https://arxiv.org/pdf/2203.07378) |
| 47 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) |
| 48 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_bvcc | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
| 49 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_nisqa | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
Expand Down
5 changes: 5 additions & 0 deletions egs/separate_metrics/cdpam_distance.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CDPAM distance metrics
# CDPAM distance between audio samples
# More info at https://github.com/facebookresearch/audiocraft
# -- cdpam_distance: the CDPAM distance between audio samples
- name: cdpam_distance
20 changes: 20 additions & 0 deletions egs/separate_metrics/chroma_alignment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Chroma Alignment related metrics
# Chroma-based distance estimation with dynamic programming alignment
# Uses librosa chroma features (STFT, CQT, CENS) with DTW alignment
# -- chroma_stft_cosine_dtw: STFT chroma features with cosine distance and DTW
# -- chroma_stft_euclidean_dtw: STFT chroma features with euclidean distance and DTW
# -- chroma_cqt_cosine_dtw: CQT chroma features with cosine distance and DTW
# -- chroma_cqt_euclidean_dtw: CQT chroma features with euclidean distance and DTW
# -- chroma_cens_cosine_dtw: CENS chroma features with cosine distance and DTW
# -- chroma_cens_euclidean_dtw: CENS chroma features with euclidean distance and DTW
# -- chroma_stft_cosine_dtw_raw: Raw DTW distance with higher scaling
# -- chroma_stft_cosine_dtw_log: Log-scaled DTW distance
- name: chroma_alignment
sample_rate: 22050
feature_types: ["stft", "cqt", "cens"]
distance_metrics: ["cosine", "euclidean"]
scale_factor: 100.0
normalize: True
normalize_by_path: True
return_alignment: False
chroma_kwargs: {}
5 changes: 5 additions & 0 deletions egs/separate_metrics/dpam_distance.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# DPAM distance metrics
# DPAM distance between audio samples
# More info at https://github.com/adrienchaton/PerceptualAudio_Pytorch
# -- dpam_distance: the DPAM distance between audio samples
- name: dpam_distance
7 changes: 7 additions & 0 deletions egs/separate_metrics/emo_vad.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# EmoVad related metrics
# Dimensional emotion prediction (arousal, valence, dominance) using w2v2-how-to
# More info in https://github.com/audeering/w2v2-how-to
# -- arousal_emo_vad: predicted arousal score from the w2v2 dimensional emotion model
# -- valence_emo_vad: predicted valence score from the w2v2 dimensional emotion model
# -- dominance_emo_vad: predicted dominance score from the w2v2 dimensional emotion model
- name: emo_vad
6 changes: 3 additions & 3 deletions egs/separate_metrics/lid.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

# Word error rate with ESPnet-OWSM model
# Language Identification with ESPnet-OWSM model
# More model_tag values can be found on the ESPnet Hugging Face page: https://huggingface.co/espnet
# The default model is `espnet/owsm_v3.1_ebf`.
# --lid: the nbest language tag
# -- language: the n-best language tags
- name: lid
model_tag: default
nbest: 5
use_gpu: false


3 changes: 2 additions & 1 deletion egs/separate_metrics/nisqa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
# -- nisqa_noi_pred: NISQA noise prediction
# -- nisqa_dis_pred: NISQA distortion prediction
# -- nisqa_col_pred: NISQA color prediction
# --nisqa_loud_pred: NISQA loudness prediction
# -- nisqa_loud_pred: NISQA loudness prediction

# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
use_gpu: false
1 change: 1 addition & 0 deletions egs/separate_metrics/nomad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
# -- nomad: nomad reference-based model
- name: nomad
model_cache: versa_cache/nomad_pt-models
use_gpu: false
10 changes: 7 additions & 3 deletions egs/separate_metrics/noresqa.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# noresqa related metrics
# -- noresqa: non-matching reference based speech quality assessment
- name: noresqa
metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
# -- noresqa_mos: NORESQA-MOS (metric_type=1)
# -- noresqa_score: NORESQA-score (metric_type=0)
- name: noresqa_mos
metric_type: 1 # 0: NORESQA-score, 1: NORESQA-MOS
model_tag: default
cache_dir: versa_cache/noresqa_model
use_gpu: false
11 changes: 11 additions & 0 deletions egs/separate_metrics/pesq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# PESQ: Perceptual Evaluation of Speech Quality
# https://www.itu.int/rec/T-REC-P.862
#
# PESQ is a reference-based metric that measures speech quality
# by comparing a degraded signal to a reference signal.
#
# Supported sample rates:
# - 8kHz: narrowband (nb) mode
# - 16kHz: wideband (wb) mode
# - Other rates: automatically resampled to nearest supported rate
- name: pesq
5 changes: 0 additions & 5 deletions egs/separate_metrics/w2v2_dimensional_emotion.yaml

This file was deleted.

147 changes: 101 additions & 46 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,108 @@
from setuptools import setup, find_packages
import os


# Read README for long description
def read_readme():
    """Return the contents of README.md for use as the package long description.

    The file is looked up in the directory that contains this script. When it
    does not exist, a short one-line description is returned instead.

    Returns:
        str: The README text, or the fallback description.
    """
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    # Open directly instead of checking os.path.exists() first, so there is no
    # window between the check and the open in which the file can disappear.
    try:
        with open(readme_path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return "A package for versatile evaluation of speech and audio"


setup(
name="versa-speech-audio-toolkit",
version="1.0.0",
author="Jiatong Shi",
author_email="ftshijt@gmail.com",
description="A package for versatile evaluation of speech and audio",
long_description=read_readme(),
long_description_content_type="text/markdown",
url="https://github.com/wavlab-speech/versa.git",
packages=find_packages(),
python_requires=">=3.8",
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Multimedia :: Sound/Audio :: Analysis",
],
keywords=["speech", "audio", "metrics", "evaluation", "machine learning"],
install_requires=[
# Core ML and Deep Learning
"torch",
"torchaudio",
"transformers>=4.36.2",
"accelerate",
"audioread",
"ci-sdr",
"Cython",
"Distance",
"editdistance",
"einops",
"espnet @ git+https://github.com/ftshijt/espnet.git@espnet_inference#egg=espnet",
"espnet-tts-frontend",
"fast-bss-eval",
"fastdtw",
"huggingface-hub",
"hydra-core",
"idna",
"importlib-metadata",
"kaggle",
"kaldiio",
"lazy_loader",
"Levenshtein",
"librosa",
"mir-eval",
"omegaconf",
"onnxruntime",
# NOTE(jiatong): use the latest commit for python 3.13
"openai-whisper @ git+https://github.com/openai/whisper.git",
"safetensors",
"tokenizers",
"einops",
"opt-einsum",
"pesq",
"protobuf",
# Audio Processing
"librosa",
"soundfile",
"audioread",
"resampy",
"torchlibrosa",
"pyworld",
"pysptk",
# Speech and Audio Evaluation Metrics
"pesq",
"pystoi",
"python-dateutil",
"pyworld",
"pyyaml",
"mir-eval",
"fast-bss-eval",
"ci-sdr",
"speechmos",
# Text Processing and Distance Metrics
"Levenshtein",
"editdistance",
"Distance",
"rapidfuzz",
"resampy",
"safetensors",
"scikit-learn",
"sentencepiece",
"setuptools",
"soundfile",
"speechmos",
# Scientific Computing
"scikit-learn",
"sympy",
"threadpoolctl",
"tokenizers",
"torch",
"torch-complex",
"torchaudio",
"torchlibrosa",
"s3prl @ git+https://github.com/ftshijt/s3prl.git@numpy2#egg=s3prl",
"transformers>=4.36.2",
# Configuration and Utilities
"hydra-core",
"omegaconf",
"pyyaml",
"protobuf",
"python-dateutil",
"lazy_loader",
# Build and Compatibility
"Cython",
"setuptools",
"importlib-metadata",
"idna",
# Optional/External Services
"kaggle",
"kaldiio",
"fastdtw",
"onnxruntime",
# Git Dependencies - Speech/Audio Frameworks
"espnet @ git+https://github.com/ftshijt/espnet.git@espnet_inference#egg=espnet",
"espnet-tts-frontend",
"espnet_model_zoo",
"s3prl",
# Git Dependencies - Audio Models
# NOTE: Using latest commit for Python 3.13 compatibility
"openai-whisper @ git+https://github.com/openai/whisper.git",
# Git Dependencies - Evaluation Metrics
"discrete-speech-metrics @ git+https://github.com/ftshijt/DiscreteSpeechMetrics.git@v1.0.2",
# Additional Dependencies
"torch-complex",
"cdpam",
],
extras_require={
Expand All @@ -65,16 +111,25 @@
"pytest-cov>=2.10.0",
"black>=22.3.0",
"flake8>=4.0.0",
"isort>=5.0.0",
"mypy>=0.900",
],
"docs": [
"sphinx>=4.0.0",
"sphinx-rtd-theme>=1.0.0",
"myst-parser>=0.17.0",
],
"jupyter": [
"jupyter>=1.0.0",
"ipykernel>=6.0.0",
"matplotlib>=3.3.0",
],
},
entry_points={
"console_scripts": [
"versa-score=versa.bin.scorer:main",
],
},
author="Jiatong Shi",
author_email="ftshijt@gmail.com",
description="A package for versatile evaluation of speech and audio",
url="https://github.com/shinjiwlab/versa.git",
keywords="speech metrics",
include_package_data=True,
zip_safe=False,
)
Loading