diff --git a/.gitignore b/.gitignore index 108609f..cc6b13e 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,5 @@ fadtk/ scoreq/ fairseq/ UTMOSv2/ +versa_cache/ +hub/ \ No newline at end of file diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md index d72d1a2..77ce6e5 100644 --- a/docs/supported_metrics.md +++ b/docs/supported_metrics.md @@ -101,14 +101,17 @@ We include x mark if the metric is auto-installed in versa. | 3 | x | ESPnet Speech Recognition-based Error Rate | espnet_wer | espnet_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/pdf/1804.00015) | | 4 | x | ESPnet-OWSM Speech Recognition-based Error Rate | owsm_wer | owsm_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2309.13876) | | 5 | x | OpenAI-Whisper Speech Recognition-based Error Rate | whisper_wer | whisper_wer |[Whisper](https://github.com/openai/whisper) | [paper](https://arxiv.org/abs/2212.04356) | -| 6 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) | -| 7 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) | -| 8 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) | -| 9 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) | -| 10 | | Accompaniment Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) | -| 11 | | Log Likelihood Ratio (LLR) | pysepm | 
pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)| -| 12 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) | -| 13 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) | +| 6 | | Faster-Whisper Speech Recognition-based Error Rate | fwhisper_wer | fwhisper_wer |[Faster-Whisper](https://github.com/systran/faster-whisper) | - | +| 7 | | NVIDIA Conformer-Transducer X-Large Speech Recognition-based Error Rate | nemo_wer | nemo_wer |[NeMo](https://github.com/NVIDIA/NeMo) | [paper](https://arxiv.org/abs/2005.08100) | +| 8 | x | Facebook Hubert-Large-Finetuned Speech Recognition-based Error Rate | hubert_wer | hubert_wer |[HuBERT](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) | [paper](https://arxiv.org/abs/2106.07447) | +| 9 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) | +| 10 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) | +| 11 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) | +| 12 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) | +| 13 | | Accompaniment 
Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) | +| 14 | | Log Likelihood Ratio (LLR) | pysepm | pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)| +| 15 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) | +| 16 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) | ### Distributional Metrics (in verifying) diff --git a/egs/separate_metrics/wer.yaml b/egs/separate_metrics/wer.yaml index 1d4847e..1c63624 100644 --- a/egs/separate_metrics/wer.yaml +++ b/egs/separate_metrics/wer.yaml @@ -41,7 +41,7 @@ # More model_tag can be from https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages . # The default model is `large-v3`. # NOTE(jiatong): further aggregation are necessary for corpus-level WER/CER -# --whisper_hyp_text: the hypothesis from ESPnet ASR decoding +# --whisper_hyp_text: the hypothesis from Whisper ASR decoding # --ref_text: reference text (after cleaner) # --whisper_wer_delete: delete errors # --whisper_wer_insert: insertion errors @@ -54,4 +54,61 @@ - name: whisper_wer model_tag: default beam_size: 5 + text_cleaner: whisper_basic + + +# Word error rate with faster-whisper model +# Please refer to tools/install_fwhisper.sh for installing. +# More model_tag can be from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/utils.py . +# The default model is `large-v3`. 
+# --fwhisper_hyp_text: the hypothesis from faster-whisper ASR decoding +# --ref_text: reference text (after cleaner) +# --fwhisper_wer_delete: delete errors +# --fwhisper_wer_insert: insertion errors +# --fwhisper_wer_replace: replacement errors +# --fwhisper_wer_equal: correct matching words/character counts +# --fwhisper_cer_delete: delete errors +# --fwhisper_cer_insert: insertion errors +# --fwhisper_cer_replace: replacement errors +# --fwhisper_cer_equal: correct matching words/character counts +- name: fwhisper_wer + model_tag: default + beam_size: 5 + batch_size: 1 + compute_type: float32 + text_cleaner: whisper_basic + + +# Word error rate with NeMo asr model +# Please refer to tools/install_nemo.sh for installing. +# The default model is `nvidia/stt_en_conformer_transducer_xlarge`. +# --nemo_hyp_text: the hypothesis from NeMo ASR decoding +# --ref_text: reference text (after cleaner) +# --nemo_wer_delete: delete errors +# --nemo_wer_insert: insertion errors +# --nemo_wer_replace: replacement errors +# --nemo_wer_equal: correct matching words/character counts +# --nemo_cer_delete: delete errors +# --nemo_cer_insert: insertion errors +# --nemo_cer_replace: replacement errors +# --nemo_cer_equal: correct matching words/character counts +- name: nemo_wer + model_tag: default + text_cleaner: whisper_basic + + +# Word error rate with Hubert-Large-Finetuned model +# The default model is `facebook/hubert-large-ls960-ft`. 
+# --hubert_hyp_text: the hypothesis from Hubert ASR decoding +# --ref_text: reference text (after cleaner) +# --hubert_wer_delete: delete errors +# --hubert_wer_insert: insertion errors +# --hubert_wer_replace: replacement errors +# --hubert_wer_equal: correct matching words/character counts +# --hubert_cer_delete: delete errors +# --hubert_cer_insert: insertion errors +# --hubert_cer_replace: replacement errors +# --hubert_cer_equal: correct matching words/character counts +- name: hubert_wer + model_tag: default text_cleaner: whisper_basic \ No newline at end of file diff --git a/test/test_pipeline/test_general.py b/test/test_pipeline/test_general.py index d8929ea..5d55bc4 100755 --- a/test/test_pipeline/test_general.py +++ b/test/test_pipeline/test_general.py @@ -40,6 +40,11 @@ "torch_squim_stoi": 0.6027805209159851, "torch_squim_pesq": 1.1683127880096436, "torch_squim_si_sdr": -11.109052658081055, + "dpam_distance": 0.15004253387451172, + "cdpam_distance": 0.05146043747663498, + "dnsmos_pro_bvcc": 1.1717286109924316, + "dnsmos_pro_nisqa": 1.4733699560165405, + "dnsmos_pro_vcc2018": 1.930935263633728, } @@ -77,7 +82,7 @@ def info_update(): # for sir" continue # the plc mos is undeterministic - if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos": + if abs(TEST_INFO[key] - summary[key]) > 2e-4 and key != "plcmos": raise ValueError( "Value issue in the test case, might be some issue in scorer {}".format( key diff --git a/test/test_pipeline/test_wer.py b/test/test_pipeline/test_wer.py index cb7c43c..f358e4f 100755 --- a/test/test_pipeline/test_wer.py +++ b/test/test_pipeline/test_wer.py @@ -12,68 +12,49 @@ ) TEST_INFO = { - "mcd": 5.045226506332897, - "f0rmse": 20.28100448994277, - "f0corr": -0.07540903652440145, - "sdr": 4.8739529795936445, - "sir": float("inf"), - "sar": 4.8739529795936445, - "si_snr": 1.0702757835388184, - "ci_sdr": 4.873954772949219, - "pesq": 1.5722705125808716, - "stoi": 0.0076251088596473275, - "speech_bert": 0.9727544188499451, 
- "speech_bleu": 0.6699938983346256, - "speech_token_distance": 0.850506056080969, - "utmos": 1.9074358940124512, - "dns_overall": 1.4526059573614438, - "dns_p808": 2.094302177429199, - "plcmos": 3.1603124300638834, - "spk_similarity": 0.8953609466552734, - "singmos": 2.0403053760528564, - "sheet_ssqa": 1.5056110620498657, - "se_sdr": -10.220606003834313, - "se_sar": -10.220606003834313, - "se_si_snr": -16.837072372436523, - "se_ci_sdr": -10.220602989196777, + "espnet_wer_equal": 1, + "owsm_wer_equal": 1, + "whisper_wer_equal": 1, + "fwhisper_wer_equal": 1, + "nemo_wer_equal": 1, + "hubert_wer_equal": 1, } def info_update(): # find files - if os.path.isdir("test/test_samples/test2"): - gen_files = find_files("test/test_samples/test2") - - # find reference file - if os.path.isdir("test/test_samples/test1"): - gt_files = find_files("test/test_samples/test1") + if os.path.isdir("test/test_samples/test_wer"): + gen_files = find_files("test/test_samples/test_wer") logging.info("The number of utterances = %d" % len(gen_files)) - with open("egs/speech.yaml", "r", encoding="utf-8") as f: + with open("egs/separate_metrics/wer.yaml", "r", encoding="utf-8") as f: score_config = yaml.full_load(f) score_modules = load_score_modules( score_config, - use_gt=(True if gt_files is not None else False), + use_gt=False, + use_gt_text=True, use_gpu=False, ) assert len(score_config) > 0, "no scoring function is provided" + text_info = {} + with open("test/test_samples/text_wer") as f: + for line in f.readlines(): + key, value = line.strip().split(maxsplit=1) + text_info[key] = value + score_info = list_scoring( - gen_files, score_modules, gt_files, output_file=None, io="soundfile" + gen_files, score_modules, text_info=text_info, output_file=None, io="soundfile" ) summary = load_summary(score_info) print("Summary: {}".format(load_summary(score_info)), flush=True) - for key in summary: - if math.isinf(TEST_INFO[key]) and math.isinf(summary[key]): - # for sir" - continue - # the plc mos is 
undeterministic - if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos": + for key in TEST_INFO: + if abs(TEST_INFO[key] - summary[key]) > 0 and not key == "espnet_wer_equal": raise ValueError( "Value issue in the test case, might be some issue in scorer {}".format( key diff --git a/test/test_samples/test_wer/test_wer.wav b/test/test_samples/test_wer/test_wer.wav new file mode 100644 index 0000000..f3ec588 Binary files /dev/null and b/test/test_samples/test_wer/test_wer.wav differ diff --git a/test/test_samples/text_wer b/test/test_samples/text_wer new file mode 100644 index 0000000..567a094 --- /dev/null +++ b/test/test_samples/text_wer @@ -0,0 +1 @@ +test_wer.wav Look! \ No newline at end of file diff --git a/tools/install_fwhisper.sh b/tools/install_fwhisper.sh new file mode 100644 index 0000000..eed9d70 --- /dev/null +++ b/tools/install_fwhisper.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e + +pip install faster-whisper + +if ! command -v nvcc &>/dev/null; then + echo "Error: nvcc not found. Please install the CUDA Toolkit first." 
>&2 + exit 1 +fi + +cuda_ver=$(nvcc --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p') +cuda_major=${cuda_ver%%.*} +echo "Detected CUDA version: $cuda_ver" + +if [ "$cuda_major" -ge 12 ]; then + conda install -c conda-forge "cudnn=9.*" "numpy<2.3" +elif [ "$cuda_major" -eq 11 ]; then + conda install -c conda-forge "cudnn=8.*" "numpy<2.3" + pip install --force-reinstall 'ctranslate2==3.24.0' 'numpy<2.2' +else + echo "Error: Unsupported CUDA major version $cuda_major" >&2 + exit 1 +fi \ No newline at end of file diff --git a/tools/install_nemo.sh b/tools/install_nemo.sh new file mode 100644 index 0000000..431df50 --- /dev/null +++ b/tools/install_nemo.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# NOTE(Haoran): Toolkit for nemo_wer + +pip install "nemo_toolkit[asr]" \ No newline at end of file diff --git a/versa/__init__.py b/versa/__init__.py index 3210249..46531d2 100644 --- a/versa/__init__.py +++ b/versa/__init__.py @@ -55,6 +55,18 @@ whisper_levenshtein_metric, whisper_wer_setup, ) +from versa.corpus_metrics.fwhisper_wer import ( + fwhisper_levenshtein_metric, + fwhisper_wer_setup, +) +from versa.corpus_metrics.nemo_wer import ( + nemo_levenshtein_metric, + nemo_wer_setup, +) +from versa.corpus_metrics.hubert_wer import ( + hubert_levenshtein_metric, + hubert_wer_setup, +) from versa.utterance_metrics.asr_matching import asr_match_metric, asr_match_setup from versa.utterance_metrics.audiobox_aesthetics_score import ( audiobox_aesthetics_score, diff --git a/versa/corpus_metrics/fwhisper_wer.py b/versa/corpus_metrics/fwhisper_wer.py new file mode 100644 index 0000000..64cc7e2 --- /dev/null +++ b/versa/corpus_metrics/fwhisper_wer.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Haoran Wang +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + from faster_whisper import WhisperModel, BatchedInferencePipeline +except ImportError: + 
logging.warning( + "Faster-whisper is not properly installed. Please install following https://github.com/systran/faster-whisper" + ) + WhisperModel = None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def fwhisper_wer_setup( + model_tag="default", beam_size=5, batch_size=1, compute_type="float32", text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "large-v3" + device = "cuda" if use_gpu else "cpu" + if WhisperModel is None: + raise RuntimeError( + "Whisper WER is used for evaluation while faster-whisper is not installed" + ) + model_whisper = WhisperModel(model_tag, device=device, compute_type=compute_type) + if batch_size > 1: + model = BatchedInferencePipeline(model=model_whisper) + else: + model = model_whisper + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": model, "cleaner": textcleaner, "beam_size": beam_size, "batch_size": batch_size, "compute_type": compute_type} + return wer_utils + + +def fwhisper_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. 
+ including: faster-whisper model ("model"), text cleaner ("cleaner"), + beam size ("beam_size") and batch size ("batch_size") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): dictionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + if wer_utils["batch_size"] > 1: + pred_x = pred_x.astype(getattr(np, wer_utils["compute_type"])) + inf_output, _ = wer_utils["model"].transcribe( + pred_x, beam_size=wer_utils["beam_size"], batch_size=wer_utils["batch_size"] + ) + inf_text = "".join(segment.text for segment in inf_output) + else: + inf_output, _ = wer_utils["model"].transcribe( + pred_x, beam_size=wer_utils["beam_size"] + ) + inf_text = "".join(segment.text for segment in inf_output) + + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + "fwhisper_hyp_text": pred_text, + "ref_text": ref_text, + "fwhisper_wer_delete": 0, + "fwhisper_wer_insert": 0, + "fwhisper_wer_replace": 0, + "fwhisper_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["fwhisper_wer_" + op] = ret["fwhisper_wer_" + op] + inf_et - inf_st + else: + ret["fwhisper_wer_" + op] = ret["fwhisper_wer_" + op] + ref_et - ref_st + total = ( + ret["fwhisper_wer_delete"] + + ret["fwhisper_wer_replace"] + + ret["fwhisper_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["fwhisper_wer_insert"] + + ret["fwhisper_wer_replace"] + + ret["fwhisper_wer_equal"] + ) + assert total == 
len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret["fwhisper_cer_delete"] = 0 + ret["fwhisper_cer_insert"] = 0 + ret["fwhisper_cer_replace"] = 0 + ret["fwhisper_cer_equal"] = 0 + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["fwhisper_cer_" + op] = ret["fwhisper_cer_" + op] + inf_et - inf_st + else: + ret["fwhisper_cer_" + op] = ret["fwhisper_cer_" + op] + ref_et - ref_st + total = ( + ret["fwhisper_cer_delete"] + + ret["fwhisper_cer_replace"] + + ret["fwhisper_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["fwhisper_cer_insert"] + + ret["fwhisper_cer_replace"] + + ret["fwhisper_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = fwhisper_wer_setup() + print( + "metrics: {}".format( + fwhisper_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/corpus_metrics/hubert_wer.py b/versa/corpus_metrics/hubert_wer.py new file mode 100644 index 0000000..ffcd249 --- /dev/null +++ b/versa/corpus_metrics/hubert_wer.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Haoran Wang +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + from transformers import Wav2Vec2Processor, HubertForCTC +except ImportError: + logging.warning( + "transformers is not properly installed." 
+ ) + Wav2Vec2Processor = None + HubertForCTC = None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def hubert_wer_setup( + model_tag="default", text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "facebook/hubert-large-ls960-ft" + device = "cuda" if use_gpu else "cpu" + if Wav2Vec2Processor is None and HubertForCTC is None: + raise RuntimeError( + "Facebook's hubert WER is used for evaluation while transformers is not installed" + ) + processor = Wav2Vec2Processor.from_pretrained(model_tag) + model = HubertForCTC.from_pretrained(model_tag) + + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": model, "processor": processor, "cleaner": textcleaner} + return wer_utils + + +def hubert_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. + including: hubert asr model ("model"), text cleaner ("textcleaner") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): ditionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + input_values = wer_utils["processor"](pred_x, return_tensors="pt").input_values + logits = wer_utils["model"](input_values).logits + predicted_ids = torch.argmax(logits, dim=-1) + inf_text = wer_utils["processor"].decode(predicted_ids[0]) + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + 
"hubert_hyp_text": pred_text, + "ref_text": ref_text, + "hubert_wer_delete": 0, + "hubert_wer_insert": 0, + "hubert_wer_replace": 0, + "hubert_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["hubert_wer_" + op] = ret["hubert_wer_" + op] + inf_et - inf_st + else: + ret["hubert_wer_" + op] = ret["hubert_wer_" + op] + ref_et - ref_st + total = ( + ret["hubert_wer_delete"] + + ret["hubert_wer_replace"] + + ret["hubert_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["hubert_wer_insert"] + + ret["hubert_wer_replace"] + + ret["hubert_wer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret.update( + hubert_cer_delete=0, + hubert_cer_insert=0, + hubert_cer_replace=0, + hubert_cer_equal=0, + ) + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["hubert_cer_" + op] = ret["hubert_cer_" + op] + inf_et - inf_st + else: + ret["hubert_cer_" + op] = ret["hubert_cer_" + op] + ref_et - ref_st + total = ( + ret["hubert_cer_delete"] + + ret["hubert_cer_replace"] + + ret["hubert_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["hubert_cer_insert"] + + ret["hubert_cer_replace"] + + ret["hubert_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = hubert_wer_setup() + print( + "metrics: {}".format( + hubert_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/corpus_metrics/nemo_wer.py b/versa/corpus_metrics/nemo_wer.py new file mode 100644 index 0000000..57d4c5b --- /dev/null +++ b/versa/corpus_metrics/nemo_wer.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Haoran Wang +# Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + import nemo.collections.asr as nemo_asr +except ImportError: + logging.warning( + "NeMo is not properly installed. Please install following https://github.com/NVIDIA/NeMo" + ) + nemo_asr = None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def nemo_wer_setup( + model_tag="default", text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "nvidia/stt_en_conformer_transducer_xlarge" + device = "cuda" if use_gpu else "cpu" + if nemo_asr is None: + raise RuntimeError( + "NeMo WER is used for evaluation while NeMo is not installed" + ) + asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_tag) + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": asr_model, "cleaner": textcleaner} + return wer_utils + + +def nemo_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. 
+ including: nemo asr model ("model"), text cleaner ("textcleaner") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): ditionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + inf_text = wer_utils["model"].transcribe( + audio=pred_x + )[0].text + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + "nemo_hyp_text": pred_text, + "ref_text": ref_text, + "nemo_wer_delete": 0, + "nemo_wer_insert": 0, + "nemo_wer_replace": 0, + "nemo_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["nemo_wer_" + op] = ret["nemo_wer_" + op] + inf_et - inf_st + else: + ret["nemo_wer_" + op] = ret["nemo_wer_" + op] + ref_et - ref_st + total = ( + ret["nemo_wer_delete"] + + ret["nemo_wer_replace"] + + ret["nemo_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["nemo_wer_insert"] + + ret["nemo_wer_replace"] + + ret["nemo_wer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret.update( + nemo_cer_delete=0, + nemo_cer_insert=0, + nemo_cer_replace=0, + nemo_cer_equal=0, + ) + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["nemo_cer_" + op] = ret["nemo_cer_" + op] + inf_et - inf_st + else: + ret["nemo_cer_" + op] = ret["nemo_cer_" + op] + ref_et - ref_st + total = ( + ret["nemo_cer_delete"] + + 
ret["nemo_cer_replace"] + + ret["nemo_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["nemo_cer_insert"] + + ret["nemo_cer_replace"] + + ret["nemo_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = nemo_wer_setup() + print( + "metrics: {}".format( + nemo_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/metrics.py b/versa/metrics.py index 067b6d2..dd23572 100644 --- a/versa/metrics.py +++ b/versa/metrics.py @@ -31,6 +31,9 @@ "espnet_hyp_text", "owsm_hyp_text", "whisper_hyp_text", + "fwhisper_hyp_text", + "nemo_hyp_text", + "hubert_hyp_text" ] NUM_METRIC = [ @@ -112,14 +115,44 @@ "owsm_cer_equal", "whisper_wer", "whisper_wer_delete", - "espnet_wer_insert", - "espnet_wer_replace", - "espnet_wer_equal", + "whisper_wer_insert", + "whisper_wer_replace", + "whisper_wer_equal", "whisper_cer", "whisper_cer_delete", - "espnet_cer_insert", - "espnet_cer_replace", - "espnet_cer_equal", + "whisper_cer_insert", + "whisper_cer_replace", + "whisper_cer_equal", + "fwhisper_wer", + "fwhisper_wer_delete", + "fwhisper_wer_insert", + "fwhisper_wer_replace", + "fwhisper_wer_equal", + "fwhisper_cer", + "fwhisper_cer_delete", + "fwhisper_cer_insert", + "fwhisper_cer_replace", + "fwhisper_cer_equal", + "nemo_wer", + "nemo_wer_delete", + "nemo_wer_insert", + "nemo_wer_replace", + "nemo_wer_equal", + "nemo_cer", + "nemo_cer_delete", + "nemo_cer_insert", + "nemo_cer_replace", + "nemo_cer_equal", + "hubert_wer", + "hubert_wer_delete", + "hubert_wer_insert", + "hubert_wer_replace", + "hubert_wer_equal", + "hubert_cer", + "hubert_cer_delete", + "hubert_cer_insert", + "hubert_cer_replace", + "hubert_cer_equal", "emotion_similarity", "spk_similarity", "nomad", diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index fd74a31..c843f98 100644 --- a/versa/scorer_shared.py +++ 
b/versa/scorer_shared.py @@ -361,7 +361,60 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal "args": args_cache, } logging.info("Initiate Whisper WER calculation successfully") + elif config["name"] == "fwhisper_wer": + if not use_gt_text: + logging.warning("Cannot use fwhisper_wer because no gt text is provided") + continue + + logging.info("Loading fwhisper_wer metric with reference text") + from versa import fwhisper_levenshtein_metric, fwhisper_wer_setup + + score_modules["fwhisper_wer"] = { + "module": fwhisper_levenshtein_metric, + "args": fwhisper_wer_setup( + model_tag=config.get("model_tag", "default"), + beam_size=config.get("beam_size", 1), + batch_size=config.get("batch_size", 1), + compute_type=config.get("compute_type", "float32"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + use_gpu=use_gpu, + ), + } + logging.info("Initiate fwhisper WER calculation successfully") + elif config["name"] == "nemo_wer": + if not use_gt_text: + logging.warning("Cannot use nemo_wer because no gt text is provided") + continue + + logging.info("Loading nemo_wer metric with reference text") + from versa import nemo_levenshtein_metric, nemo_wer_setup + score_modules["nemo_wer"] = { + "module": nemo_levenshtein_metric, + "args": nemo_wer_setup( + model_tag=config.get("model_tag", "default"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + use_gpu=use_gpu, + ), + } + logging.info("Initiate NeMo WER calculation successfully") + elif config["name"] == "hubert_wer": + if not use_gt_text: + logging.warning("Cannot use hubert_wer because no gt text is provided") + continue + + logging.info("Loading hubert_wer metric with reference text") + from versa import hubert_levenshtein_metric, hubert_wer_setup + + score_modules["hubert_wer"] = { + "module": hubert_levenshtein_metric, + "args": hubert_wer_setup( + model_tag=config.get("model_tag", "default"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + 
use_gpu=use_gpu, + ), + } + logging.info("Initiate hubert WER calculation successfully") elif config["name"] == "scoreq_ref": if not use_gt: logging.warning("Cannot use scoreq_ref because no gt audio is provided") @@ -1001,7 +1054,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None): score = score_modules[key]["module"]( score_modules[key]["model"], gen_wav, gt_wav, gen_sr ) - elif key == "espnet_wer" or key == "owsm_wer" or key == "whisper_wer": + elif key in ["espnet_wer", "owsm_wer", "whisper_wer", "fwhisper_wer", "nemo_wer", "hubert_wer"]: score = score_modules[key]["module"]( score_modules[key]["args"], gen_wav, @@ -1143,7 +1196,7 @@ def list_scoring( # Step2: load reference (gt) speech and conduct basic checks if gt_files is not None: - if key not in gen_files.keys(): + if key not in gt_files.keys(): logging.warning( "key {} not found in ground truth files though provided, skipping".format( key