2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -169,3 +169,5 @@ fadtk/
scoreq/
fairseq/
UTMOSv2/
versa_cache/
hub/
19 changes: 11 additions & 8 deletions docs/supported_metrics.md
@@ -101,14 +101,17 @@ We include an x mark if the metric is auto-installed in versa.
| 3 | x | ESPnet Speech Recognition-based Error Rate | espnet_wer | espnet_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/pdf/1804.00015) |
| 4 | x | ESPnet-OWSM Speech Recognition-based Error Rate | owsm_wer | owsm_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2309.13876) |
| 5 | x | OpenAI-Whisper Speech Recognition-based Error Rate | whisper_wer | whisper_wer |[Whisper](https://github.com/openai/whisper) | [paper](https://arxiv.org/abs/2212.04356) |
| 6 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) |
| 7 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) |
| 8 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) |
| 9 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) |
| 10 | | Accompaniment Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) |
| 11 | | Log Likelihood Ratio (LLR) | pysepm | pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)|
| 12 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) |
| 13 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) |
| 6 | | Faster-Whisper Speech Recognition-based Error Rate | fwhisper_wer | fwhisper_wer |[Faster-Whisper](https://github.com/systran/faster-whisper) | - |
| 7 | | NVIDIA Conformer-Transducer X-Large Speech Recognition-based Error Rate | nemo_wer | nemo_wer |[NeMo](https://github.com/NVIDIA/NeMo) | [paper](https://arxiv.org/abs/2005.08100) |
| 8 | x | Facebook Hubert-Large-Finetuned Speech Recognition-based Error Rate | hubert_wer | hubert_wer |[HuBERT](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) | [paper](https://arxiv.org/abs/2106.07447) |
| 9 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) |
| 10 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) |
| 11 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) |
| 12 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) |
| 13 | | Accompaniment Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) |
| 14 | | Log Likelihood Ratio (LLR) | pysepm | pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)|
| 15 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) |
| 16 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) |

### Distributional Metrics (under verification)

59 changes: 58 additions & 1 deletion egs/separate_metrics/wer.yaml
@@ -41,7 +41,7 @@
# More model_tag options can be found at https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages .
# The default model is `large-v3`.
# NOTE(jiatong): further aggregation is necessary for corpus-level WER/CER
# --whisper_hyp_text: the hypothesis from ESPnet ASR decoding
# --whisper_hyp_text: the hypothesis from Whisper ASR decoding
# --ref_text: reference text (after cleaner)
# --whisper_wer_delete: deletion errors
# --whisper_wer_insert: insertion errors
@@ -54,4 +54,61 @@
- name: whisper_wer
model_tag: default
beam_size: 5
text_cleaner: whisper_basic


# Word error rate with faster-whisper model
# Please refer to tools/install_fwhisper.sh for installing.
# More model_tag options can be found at https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/utils.py .
# The default model is `large-v3`.
# --fwhisper_hyp_text: the hypothesis from faster-whisper ASR decoding
# --ref_text: reference text (after cleaner)
# --fwhisper_wer_delete: deletion errors
# --fwhisper_wer_insert: insertion errors
# --fwhisper_wer_replace: replacement errors
# --fwhisper_wer_equal: count of correctly matched words
# --fwhisper_cer_delete: deletion errors
# --fwhisper_cer_insert: insertion errors
# --fwhisper_cer_replace: replacement errors
# --fwhisper_cer_equal: count of correctly matched characters
- name: fwhisper_wer
model_tag: default
beam_size: 5
batch_size: 1
compute_type: float32
text_cleaner: whisper_basic


# Word error rate with NeMo asr model
# Please refer to tools/install_nemo.sh for installing.
# The default model is `nvidia/stt_en_conformer_transducer_xlarge`.
# --nemo_hyp_text: the hypothesis from NeMo ASR decoding
# --ref_text: reference text (after cleaner)
# --nemo_wer_delete: deletion errors
# --nemo_wer_insert: insertion errors
# --nemo_wer_replace: replacement errors
# --nemo_wer_equal: count of correctly matched words
# --nemo_cer_delete: deletion errors
# --nemo_cer_insert: insertion errors
# --nemo_cer_replace: replacement errors
# --nemo_cer_equal: count of correctly matched characters
- name: nemo_wer
model_tag: default
text_cleaner: whisper_basic


# Word error rate with Hubert-Large-Finetuned model
# The default model is `facebook/hubert-large-ls960-ft`.
# --hubert_hyp_text: the hypothesis from Hubert ASR decoding
# --ref_text: reference text (after cleaner)
# --hubert_wer_delete: deletion errors
# --hubert_wer_insert: insertion errors
# --hubert_wer_replace: replacement errors
# --hubert_wer_equal: count of correctly matched words
# --hubert_cer_delete: deletion errors
# --hubert_cer_insert: insertion errors
# --hubert_cer_replace: replacement errors
# --hubert_cer_equal: count of correctly matched characters
- name: hubert_wer
model_tag: default
text_cleaner: whisper_basic
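The delete/insert/replace/equal counters documented in the config comments above come from an edit-distance alignment between the reference text and the ASR hypothesis. The sketch below (an illustration, not versa's actual implementation, which uses a Levenshtein alignment) shows how such counts can be tallied from alignment opcodes:

```python
# Illustrative sketch: tally aligned edit operations between a reference
# and an ASR hypothesis, matching the *_wer_{delete,insert,replace,equal}
# fields described above. Uses difflib as a stand-in aligner.
from difflib import SequenceMatcher


def edit_counts(ref_words, hyp_words):
    """Count edit operations from the alignment of reference vs. hypothesis."""
    counts = {"delete": 0, "insert": 0, "replace": 0, "equal": 0}
    for op, i1, i2, j1, j2 in SequenceMatcher(
        None, ref_words, hyp_words
    ).get_opcodes():
        if op == "equal":
            counts["equal"] += i2 - i1
        elif op == "delete":
            counts["delete"] += i2 - i1
        elif op == "insert":
            counts["insert"] += j2 - j1
        else:  # "replace": spans may differ in length on each side
            counts["replace"] += max(i2 - i1, j2 - j1)
    return counts


counts = edit_counts("the cat sat".split(), "the cat sit down".split())
# WER = (delete + insert + replace) / number of reference words
wer = (counts["delete"] + counts["insert"] + counts["replace"]) / 3
```

For character error rate (CER), the same tally runs over character sequences instead of word lists.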
7 changes: 6 additions & 1 deletion test/test_pipeline/test_general.py
@@ -40,6 +40,11 @@
"torch_squim_stoi": 0.6027805209159851,
"torch_squim_pesq": 1.1683127880096436,
"torch_squim_si_sdr": -11.109052658081055,
"dpam_distance": 0.15004253387451172,
"cdpam_distance": 0.05146043747663498,
"dnsmos_pro_bvcc": 1.1717286109924316,
"dnsmos_pro_nisqa": 1.4733699560165405,
"dnsmos_pro_vcc2018": 1.930935263633728,
}


@@ -77,7 +82,7 @@ def info_update():
# for sir
continue
# the PLC MOS is nondeterministic
if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos":
if abs(TEST_INFO[key] - summary[key]) > 2e-4 and key != "plcmos":
raise ValueError(
"Value issue in the test case, might be some issue in scorer {}".format(
key
59 changes: 20 additions & 39 deletions test/test_pipeline/test_wer.py
@@ -12,68 +12,49 @@
)

TEST_INFO = {
"mcd": 5.045226506332897,
"f0rmse": 20.28100448994277,
"f0corr": -0.07540903652440145,
"sdr": 4.8739529795936445,
"sir": float("inf"),
"sar": 4.8739529795936445,
"si_snr": 1.0702757835388184,
"ci_sdr": 4.873954772949219,
"pesq": 1.5722705125808716,
"stoi": 0.0076251088596473275,
"speech_bert": 0.9727544188499451,
"speech_bleu": 0.6699938983346256,
"speech_token_distance": 0.850506056080969,
"utmos": 1.9074358940124512,
"dns_overall": 1.4526059573614438,
"dns_p808": 2.094302177429199,
"plcmos": 3.1603124300638834,
"spk_similarity": 0.8953609466552734,
"singmos": 2.0403053760528564,
"sheet_ssqa": 1.5056110620498657,
"se_sdr": -10.220606003834313,
"se_sar": -10.220606003834313,
"se_si_snr": -16.837072372436523,
"se_ci_sdr": -10.220602989196777,
"espnet_wer_equal": 1,
"owsm_wer_equal": 1,
"whisper_wer_equal": 1,
"fwhisper_wer_equal": 1,
"nemo_wer_equal": 1,
"hubert_wer_equal": 1,
}


def info_update():

# find files
if os.path.isdir("test/test_samples/test2"):
gen_files = find_files("test/test_samples/test2")

# find reference file
if os.path.isdir("test/test_samples/test1"):
gt_files = find_files("test/test_samples/test1")
if os.path.isdir("test/test_samples/test_wer"):
gen_files = find_files("test/test_samples/test_wer")

logging.info("The number of utterances = %d" % len(gen_files))

with open("egs/speech.yaml", "r", encoding="utf-8") as f:
with open("egs/separate_metrics/wer.yaml", "r", encoding="utf-8") as f:
score_config = yaml.full_load(f)

score_modules = load_score_modules(
score_config,
use_gt=(True if gt_files is not None else False),
use_gt=False,
use_gt_text=True,
use_gpu=False,
)

assert len(score_config) > 0, "no scoring function is provided"

text_info = {}
with open("test/test_samples/text_wer") as f:
for line in f.readlines():
key, value = line.strip().split(maxsplit=1)
text_info[key] = value

score_info = list_scoring(
gen_files, score_modules, gt_files, output_file=None, io="soundfile"
gen_files, score_modules, text_info=text_info, output_file=None, io="soundfile"
)
summary = load_summary(score_info)
print("Summary: {}".format(summary), flush=True)

for key in summary:
if math.isinf(TEST_INFO[key]) and math.isinf(summary[key]):
# for sir
continue
# the PLC MOS is nondeterministic
if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos":
for key in TEST_INFO:
if abs(TEST_INFO[key] - summary[key]) > 0 and key != "espnet_wer_equal":
raise ValueError(
"Value issue in the test case, might be some issue in scorer {}".format(
key
Binary file added test/test_samples/test_wer/test_wer.wav
Binary file not shown.
1 change: 1 addition & 0 deletions test/test_samples/text_wer
@@ -0,0 +1 @@
test_wer.wav Look!
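The reference-text file added above uses Kaldi-style lines of the form `<utterance-id> <transcript>`, the same format the updated test parses. A minimal parsing sketch (hypothetical helper name; the test inlines this logic):

```python
# Minimal sketch of parsing a Kaldi-style text file, where each line is
# "<utterance-id> <transcript>", e.g. "test_wer.wav Look!".
def load_text(lines):
    text_info = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip blank lines
        key, value = line.split(maxsplit=1)
        text_info[key] = value
    return text_info


info = load_text(["test_wer.wav Look!"])
# info == {"test_wer.wav": "Look!"}
```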
24 changes: 24 additions & 0 deletions tools/install_fwhisper.sh
@@ -0,0 +1,24 @@
#!/bin/bash

set -e

pip install faster-whisper

if ! command -v nvcc &>/dev/null; then
    echo "Error: nvcc not found. Please install the CUDA Toolkit first." >&2
    exit 1
fi

cuda_ver=$(nvcc --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p')
cuda_major=${cuda_ver%%.*}
echo "Detected CUDA version: $cuda_ver"

if [ "$cuda_major" -ge 12 ]; then
    conda install -c conda-forge "cudnn=9.*" "numpy<2.3"
elif [ "$cuda_major" -eq 11 ]; then
    conda install -c conda-forge "cudnn=8.*" "numpy<2.3"
    pip install --force-reinstall 'ctranslate2==3.24.0' 'numpy<2.2'
else
    echo "Error: Unsupported CUDA major version $cuda_major" >&2
    exit 1
fi
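The CUDA-version detection in the script above can be checked in isolation by feeding the `sed` expression a sample `nvcc --version` banner (the banner string below is an assumption for illustration, not output captured from a real machine):

```shell
# Sanity-check the version-parsing logic from install_fwhisper.sh against
# a sample nvcc banner line (assumed text, for illustration only).
banner="Cuda compilation tools, release 12.4, V12.4.131"
cuda_ver=$(printf '%s\n' "$banner" | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p')
cuda_major=${cuda_ver%%.*}
echo "$cuda_ver $cuda_major"
```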
5 changes: 5 additions & 0 deletions tools/install_nemo.sh
@@ -0,0 +1,5 @@
#!/bin/bash

# NOTE(Haoran): Toolkit for nemo_wer

pip install "nemo_toolkit[asr]"
12 changes: 12 additions & 0 deletions versa/__init__.py
@@ -55,6 +55,18 @@
whisper_levenshtein_metric,
whisper_wer_setup,
)
from versa.corpus_metrics.fwhisper_wer import (
fwhisper_levenshtein_metric,
fwhisper_wer_setup,
)
from versa.corpus_metrics.nemo_wer import (
nemo_levenshtein_metric,
nemo_wer_setup,
)
from versa.corpus_metrics.hubert_wer import (
hubert_levenshtein_metric,
hubert_wer_setup,
)
from versa.utterance_metrics.asr_matching import asr_match_metric, asr_match_setup
from versa.utterance_metrics.audiobox_aesthetics_score import (
audiobox_aesthetics_score,