def info_update():
    """Run the MultiGauss metric on the bundled samples and verify the report keys.

    Scores every file found under ``test/test_samples/test2`` with the
    configuration in ``egs/separate_metrics/multigauss.yaml`` (no reference
    audio, CPU only) and checks that the first utterance's report contains
    all MultiGauss keys.

    Raises:
        FileNotFoundError: If the sample directory does not exist.
        ValueError: If the config is empty, no scores are produced, or any
            expected MultiGauss key is missing from the report.
    """
    sample_dir = "test/test_samples/test2"
    config_path = "egs/separate_metrics/multigauss.yaml"
    expected_keys = (
        "multigauss_mos",
        "multigauss_noi",
        "multigauss_col",
        "multigauss_dis",
        "multigauss_loud",
        "multigauss_covariance",
    )

    # Fail with a clear message instead of hitting an unbound ``gen_files``.
    if not os.path.isdir(sample_dir):
        raise FileNotFoundError(f"test sample directory not found: {sample_dir}")
    gen_files = find_files(sample_dir)

    logging.info("The number of utterances = %d", len(gen_files))

    with open(config_path, "r", encoding="utf-8") as f:
        score_config = yaml.full_load(f)

    # Validate the config before spending time on model loading.
    if not score_config:
        raise ValueError("no scoring function is provided")

    score_modules = load_score_modules(
        score_config,
        use_gt=False,
        use_gpu=False,
    )

    score_info = list_scoring(
        gen_files, score_modules, output_file=None, io="soundfile"
    )
    print(score_info)

    if not score_info:
        raise ValueError("no scores were produced for the test samples")

    # A test must fail loudly; silently skipping the success message hides
    # regressions in the reported keys.
    missing_keys = [key for key in expected_keys if key not in score_info[0]]
    if missing_keys:
        raise ValueError(f"missing MultiGauss keys in report: {missing_keys}")

    print("check successful", flush=True)
multigauss_model_setup, + multigauss_metric, + ) + + multigauss_model = multigauss_model_setup( + model_tag=config.get("model_tag", "probabilistic"), + use_gpu=use_gpu, + ) + score_modules["multigauss"] = { + "module": multigauss_metric, + "model": multigauss_model, + } + logging.info("Initiate MultiGauss evaluation successfully.") + return score_modules @@ -1263,6 +1280,10 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None): score = score_modules[key]["module"]( score_modules[key]["model"], gen_wav, gen_sr ) + elif key == "multigauss": + score = score_modules[key]["module"]( + score_modules[key]["model"], gen_wav, gen_sr + ) else: raise NotImplementedError( f"Not supported metrics: {key}, check egs/separate_metrics/README.md for supported metrics" diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py new file mode 100644 index 0000000..e5d524f --- /dev/null +++ b/versa/utterance_metrics/multigauss.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Jionghao Han +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# +# This file includes code adapted from the MultiGauss project: +# https://github.com/fcumlin/MultiGauss +# Copyright (c) 2025 Fredrik Cumlin +# Licensed under the MIT License + + +r""" +Notes from the MultiGauss project (Fredrik Cumlin): +The model operates at 16 kHz sample rate and on signals of 10 s duration, hence, +all audio is resampled to 16 kHz and repeated or cropped to 10 s before +processing. Note that the sample rate implies that no energy with frequencies +above 8 kHz are seen by the model. 
+""" + +import logging + +logger = logging.getLogger(__name__) + +import sys +from pathlib import Path +import librosa +import numpy as np +import torch +import torchaudio + +MULTIGAUSS_DIR = ( + Path(__file__).parent.parent.parent / "tools" / "checkpoints" / "multigauss" +) +print(f"MULTIGAUSS_DIR: {MULTIGAUSS_DIR}") +try: + import gin + + sys.path.append(str(MULTIGAUSS_DIR)) + import model as model_lib + from train import TrainingLoop +except ImportError: + raise ImportError( + "MultiGauss is not set up. Please install the package via " + "`tools/install_multigauss.sh`" + ) + + +def _repeat_and_crop_to_length( + waveform: torch.Tensor, + target_length: int = 160_000, +) -> torch.Tensor: + """Repeates or crops the waveform to give it the target length.""" + current_length = waveform.shape[-1] + if current_length < target_length: + num_repeats = target_length // current_length + 1 + waveform = waveform.repeat(1, num_repeats) + return waveform[:, :target_length] + + +def multigauss_model_setup( + model_tag="probabilistic", cache_dir="versa_cache", use_gpu=False +): + """Setup multigauss model. + + Args: + model_tag (str): Model tag. Defaults to "probabilistic". Can be "probabilistic" or "non_probabilistic". + cache_dir (str): Cache directory. Defaults to "versa_cache". + use_gpu (bool, optional): Whether to use GPU. Defaults to False. + + Returns: + models: The loaded models. 
def multigauss_metric(models, pred_x, fs):
    """Predict MultiGauss speech-quality scores for one audio signal.

    The signal is resampled to 16 kHz when ``fs`` differs, repeated or
    cropped to 160,000 samples (10 s at 16 kHz), passed through the SSL
    feature extractor and then through the MultiGauss projection head.

    Args:
        models (dict): The loaded models. The keys ``ssl_model_extract``,
            ``multigauss_model``, ``model_tag`` and ``device`` are read.
        pred_x (np.ndarray): Audio signal. A 1-D array is treated as a single
            waveform (assumes mono input; confirm how callers pass
            multi-channel audio).
        fs (int): Sampling rate of ``pred_x`` in Hz.

    Returns:
        dict: ``multigauss_mos``, ``multigauss_noi``, ``multigauss_col``,
        ``multigauss_dis`` and ``multigauss_loud`` as Python floats. When
        ``models["model_tag"]`` is ``"probabilistic"`` the dict also holds
        ``multigauss_covariance`` as a NumPy array (dimensions presumably in
        the order mos, noi, col, dis, loud).
    """
    target_fs = 16000
    mean_keys = (
        "multigauss_mos",
        "multigauss_noi",
        "multigauss_col",
        "multigauss_dis",
        "multigauss_loud",
    )

    waveform = torch.from_numpy(pred_x).float()
    if waveform.dim() == 1:
        # The crop helper slices with two indices, so a 1-D signal that is
        # already >= 10 s long used to raise IndexError. Shorter signals were
        # implicitly promoted to (1, time) by the repeat; do it explicitly so
        # both cases behave the same.
        waveform = waveform.unsqueeze(0)
    if fs != target_fs:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=target_fs)
        waveform = resampler(waveform)
    waveform = _repeat_and_crop_to_length(
        waveform,
        target_length=160_000,  # Training was done with 10 s of audio (16 kHz).
    )

    with torch.no_grad():
        ssl_output = models["ssl_model_extract"](waveform.to(device=models["device"]))
        feature = ssl_output.squeeze().T
        head_input = feature.unsqueeze(0)
        if models["model_tag"] == "probabilistic":
            mean_prediction, covariance_prediction = models["multigauss_model"](
                head_input
            )
        else:
            mean_prediction = models["multigauss_model"](head_input)
            covariance_prediction = None

    means = mean_prediction[0]
    result = {key: means[index].item() for index, key in enumerate(mean_keys)}
    if covariance_prediction is not None:
        result["multigauss_covariance"] = (
            covariance_prediction[0].cpu().numpy()
        )  # ["mos", "noi", "col", "dis", "loud"]
    return result