From bb0668fdd5dda13783efea784707fa9e67d52504 Mon Sep 17 00:00:00 2001 From: jhan Date: Fri, 27 Jun 2025 22:02:27 -0400 Subject: [PATCH 01/10] add multigauss --- docs/supported_metrics.md | 1 + tools/install_multigauss.sh | 6 ++ versa/utterance_metrics/multigauss.py | 99 +++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100755 tools/install_multigauss.sh create mode 100644 versa/utterance_metrics/multigauss.py diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md index a15425b..ab1a1d8 100644 --- a/docs/supported_metrics.md +++ b/docs/supported_metrics.md @@ -56,6 +56,7 @@ We include x mark if the metric is auto-installed in versa. | 49 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_nisqa | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) | | 50 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_vcc2018 | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) | | 51 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) | +| 52 | | Multivariate Probabilistic Assessment of Speech Quality | multigauss | multigauss_{mos,noi,col,dis,loud} | [MultiGauss](https://github.com/fcumlin/MultiGauss) | [paper](https://arxiv.org/abs/2506.04890) | diff --git a/tools/install_multigauss.sh b/tools/install_multigauss.sh new file mode 100755 index 0000000..6828843 --- /dev/null +++ b/tools/install_multigauss.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +## cloning the MultiGauss repo into the checkpoint folder +tools_dir=$(dirname $(realpath $0)) +git clone https://github.com/fcumlin/MultiGauss.git $tools_dir/checkpoints/multigauss 
+pip install gin-config diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py new file mode 100644 index 0000000..efbfac0 --- /dev/null +++ b/versa/utterance_metrics/multigauss.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Jionghao Han +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# +# This file includes code adapted from the MultiGauss project: +# https://github.com/fcumlin/MultiGauss +# Copyright (c) 2025 Fredrik Cumlin +# Licensed under the MIT License + +import logging + +logger = logging.getLogger(__name__) + +import sys +from pathlib import Path +import librosa +import numpy as np +import torch +import torchaudio + + +try: + import gin + sys.path.append(str(Path(__file__).parent.parent.parent / "tools/checkpoints/multigauss")) + import model as model_lib + from train import TrainingLoop +except ImportError: + raise ImportError( + "MultiGauss is not set up. Please install the package via " + "`tools/install_multigauss.sh`" + ) + + +def multigauss_model_setup( + model_tag="probabilistic", cache_dir="versa_cache", use_gpu=False +): + """Setup multigauss model. + + Args: + model_tag (str): Model tag. Defaults to "probabilistic". Can be "probabilistic" or "non_probabilistic". + cache_dir (str): Cache directory. Defaults to "versa_cache". + use_gpu (bool, optional): Whether to use GPU. Defaults to False. + + Returns: + models: The loaded models. 
+ """ + device = "cuda" if use_gpu else "cpu" + model_folder = Path(f"./tools/checkpoints/multigauss/runs/{model_tag}") + print(f"Loading model from {model_folder}") + gin.clear_config() + gin.external_configurable(TrainingLoop) + gin.parse_config_file(model_folder / "config.gin", skip_unknown=True) + ssl_model_layer = gin.query_parameter("TrainingLoop.ssl_layer") + bundle = torchaudio.pipelines.WAV2VEC2_XLSR_2B + ssl_model = bundle.get_model(dl_kwargs=dict(model_dir=str(Path(cache_dir) / "torchaudio"))).to(device=device) + ssl_model.eval() + ssl_model_extract = lambda x: ssl_model.extract_features(x)[0][ssl_model_layer] + multigauss_model = model_lib.ProjectionHead() + state_dict = torch.load( + model_folder / "model_best_state_dict.pt", + map_location=device, + weights_only=True + ) + multigauss_model.load_state_dict(state_dict) + multigauss_model.eval() + return { + "ssl_model_extract": ssl_model_extract, + "multigauss_model": multigauss_model, + } + + +def multigauss_metric(models, pred_x, fs): + """Calculate multigauss score for audio. + + Args: + models (dict): The loaded models. + pred_x (np.ndarray): Audio signal. + fs (int): Sampling rate. + + Returns: + dict: Dictionary containing the multigauss score. 
+ """ + if fs != 16000: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=16000) + + with torch.no_grad(): + feature = models["ssl_model_extract"](pred_x).squeeze().T + print(f"{feature.shape=}") + mean_prediction, covariance_prediction = models["multigauss_model"](feature.unsqueeze(0)) + return { + "multigauss_mean": mean_prediction, + "multigauss_covariance": covariance_prediction, + } + +if __name__ == "__main__": + a = np.random.random(16000) + model = multigauss_model_setup(use_gpu=True if torch.cuda.is_available() else False) + print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}") From 4bf1eae2c35684f06b7dc5678f78af47b5e61582 Mon Sep 17 00:00:00 2001 From: jhan Date: Mon, 24 Nov 2025 18:17:42 -0500 Subject: [PATCH 02/10] multigauss.py bug fixes --- versa/utterance_metrics/multigauss.py | 56 ++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index efbfac0..4b1aca7 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -8,6 +8,15 @@ # Copyright (c) 2025 Fredrik Cumlin # Licensed under the MIT License + +r""" +Notes from the MultiGauss project (Fredrik Cumlin): +The model operates at 16 kHz sample rate and on signals of 10 s duration, hence, +all audio is resampled to 16 kHz and repeated or cropped to 10 s before +processing. Note that the sample rate implies that no energy with frequencies +above 8 kHz is seen by the model. 
+""" + import logging logger = logging.getLogger(__name__) @@ -19,10 +28,14 @@ import torch import torchaudio - +MULTIGAUSS_DIR = ( + Path(__file__).parent.parent.parent / "tools" / "checkpoints" / "multigauss" +) +print(f"MULTIGAUSS_DIR: {MULTIGAUSS_DIR}") try: import gin - sys.path.append(str(Path(__file__).parent.parent.parent / "tools/checkpoints/multigauss")) + + sys.path.append(str(MULTIGAUSS_DIR)) import model as model_lib from train import TrainingLoop except ImportError: @@ -32,6 +45,18 @@ ) +def _repeat_and_crop_to_length( + waveform: torch.Tensor, + target_length: int = 160_000, +) -> torch.Tensor: + """Repeats or crops the waveform to give it the target length.""" + current_length = waveform.shape[-1] + if current_length < target_length: + num_repeats = target_length // current_length + 1 + waveform = waveform.repeat(1, num_repeats) + return waveform[:, :target_length] + + def multigauss_model_setup( model_tag="probabilistic", cache_dir="versa_cache", use_gpu=False ): @@ -46,27 +71,31 @@ def multigauss_model_setup( models: The loaded models. 
""" device = "cuda" if use_gpu else "cpu" - model_folder = Path(f"./tools/checkpoints/multigauss/runs/{model_tag}") + model_folder = MULTIGAUSS_DIR / "runs" / model_tag print(f"Loading model from {model_folder}") gin.clear_config() gin.external_configurable(TrainingLoop) gin.parse_config_file(model_folder / "config.gin", skip_unknown=True) ssl_model_layer = gin.query_parameter("TrainingLoop.ssl_layer") bundle = torchaudio.pipelines.WAV2VEC2_XLSR_2B - ssl_model = bundle.get_model(dl_kwargs=dict(model_dir=str(Path(cache_dir) / "torchaudio"))).to(device=device) + ssl_model = bundle.get_model( + dl_kwargs=dict(model_dir=str(Path(cache_dir) / "torchaudio")) + ).to(device=device) ssl_model.eval() ssl_model_extract = lambda x: ssl_model.extract_features(x)[0][ssl_model_layer] - multigauss_model = model_lib.ProjectionHead() + multigauss_model = model_lib.ProjectionHead(in_shape=(1920, 499)) state_dict = torch.load( model_folder / "model_best_state_dict.pt", map_location=device, - weights_only=True + weights_only=True, ) multigauss_model.load_state_dict(state_dict) + multigauss_model = multigauss_model.to(device=device) multigauss_model.eval() return { "ssl_model_extract": ssl_model_extract, "multigauss_model": multigauss_model, + "device": device, } @@ -81,18 +110,25 @@ def multigauss_metric(models, pred_x, fs): Returns: dict: Dictionary containing the multigauss score. """ + pred_x = torch.from_numpy(pred_x).float() if fs != 16000: - pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=16000) + pred_x = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(pred_x) + pred_x = _repeat_and_crop_to_length( + pred_x, + target_length=160_000, # Training was done with 10 s of audio (16 kHz). 
+ ) with torch.no_grad(): - feature = models["ssl_model_extract"](pred_x).squeeze().T - print(f"{feature.shape=}") - mean_prediction, covariance_prediction = models["multigauss_model"](feature.unsqueeze(0)) + feature = models["ssl_model_extract"](pred_x.to(device=models["device"])).squeeze().T + mean_prediction, covariance_prediction = models["multigauss_model"]( + feature.unsqueeze(0) + ) return { "multigauss_mean": mean_prediction, "multigauss_covariance": covariance_prediction, } + if __name__ == "__main__": a = np.random.random(16000) model = multigauss_model_setup(use_gpu=True if torch.cuda.is_available() else False) From 305fa168a5b69ffa34a4a0e8a2f2ab4e94ae8cfc Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 00:53:59 -0500 Subject: [PATCH 03/10] Add MultiGauss model support for versa/bin/scorer.py --- egs/separate_metrics/multigauss.yaml | 4 ++++ versa/metrics.py | 2 ++ versa/scorer_shared.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 egs/separate_metrics/multigauss.yaml diff --git a/egs/separate_metrics/multigauss.yaml b/egs/separate_metrics/multigauss.yaml new file mode 100644 index 0000000..7d73e05 --- /dev/null +++ b/egs/separate_metrics/multigauss.yaml @@ -0,0 +1,4 @@ +# Multivariate Probabilistic Assessment of Speech Quality (MultiGauss) + +- name: multigauss + model_tag: probabilistic \ No newline at end of file diff --git a/versa/metrics.py b/versa/metrics.py index 1e86b19..0acc2ed 100644 --- a/versa/metrics.py +++ b/versa/metrics.py @@ -216,4 +216,6 @@ "arecho_wer", "arecho_cer", "arecho_nisqa_real_mos", + "multigauss_mean", + "multigauss_covariance", ] diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index cecd525..6d999bd 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -1029,6 +1029,19 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal } logging.info("Initiate ARECHO no-reference evaluation successfully.") + elif config["name"] == 
"multigauss": + logging.info("Loading MultiGauss model...") + from versa.utterance_metrics.multigauss import multigauss_model_setup, multigauss_metric + multigauss_model = multigauss_model_setup( + model_tag=config.get("model_tag", "probabilistic"), + use_gpu=use_gpu, + ) + score_modules["multigauss"] = { + "module": multigauss_metric, + "model": multigauss_model, + } + logging.info("Initiate MultiGauss evaluation successfully.") + return score_modules @@ -1263,6 +1276,10 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None): score = score_modules[key]["module"]( score_modules[key]["model"], gen_wav, gen_sr ) + elif key == "multigauss": + score = score_modules[key]["module"]( + score_modules[key]["model"], gen_wav, gen_sr + ) else: raise NotImplementedError( f"Not supported metrics: {key}, check egs/separate_metrics/README.md for supported metrics" From 0342d8b664c3b8bd1d288ff2365cdca2be93d79e Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 00:55:01 -0500 Subject: [PATCH 04/10] black formatted --- versa/utterance_metrics/multigauss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index 4b1aca7..5f366c4 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -119,7 +119,9 @@ def multigauss_metric(models, pred_x, fs): ) with torch.no_grad(): - feature = models["ssl_model_extract"](pred_x.to(device=models["device"])).squeeze().T + feature = ( + models["ssl_model_extract"](pred_x.to(device=models["device"])).squeeze().T + ) mean_prediction, covariance_prediction = models["multigauss_model"]( feature.unsqueeze(0) ) From f7b95524e980815a50249b0c82a89d948597f58e Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 01:05:56 -0500 Subject: [PATCH 05/10] black formatted --- versa/scorer_shared.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/versa/scorer_shared.py 
b/versa/scorer_shared.py index 6d999bd..d1014b0 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -1031,7 +1031,11 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal elif config["name"] == "multigauss": logging.info("Loading MultiGauss model...") - from versa.utterance_metrics.multigauss import multigauss_model_setup, multigauss_metric + from versa.utterance_metrics.multigauss import ( + multigauss_model_setup, + multigauss_metric, + ) + multigauss_model = multigauss_model_setup( model_tag=config.get("model_tag", "probabilistic"), use_gpu=use_gpu, From 3acfff1b2d6921693bcf62d82fa2f964c6193ec5 Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 01:35:37 -0500 Subject: [PATCH 06/10] Refactor multigauss_metric to return individual metrics for MOS, NOI, COL, DIS, and LOUD, while maintaining covariance structure. --- versa/utterance_metrics/multigauss.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index 5f366c4..aac94ba 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -126,8 +126,12 @@ def multigauss_metric(models, pred_x, fs): feature.unsqueeze(0) ) return { - "multigauss_mean": mean_prediction, - "multigauss_covariance": covariance_prediction, + "multigauss_mos": mean_prediction[0][0].item(), + "multigauss_noi": mean_prediction[0][1].item(), + "multigauss_col": mean_prediction[0][2].item(), + "multigauss_dis": mean_prediction[0][3].item(), + "multigauss_loud": mean_prediction[0][4].item(), + "multigauss_covariance": covariance_prediction[0].numpy(), # ["mos", "noi", "col", "dis", "loud"] } From 637de84596fd7f045ae09cf08f0f67e4ac2a3cc9 Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 01:36:32 -0500 Subject: [PATCH 07/10] Add test for MultiGauss scoring metrics in test/test_pipeline/test_multigauss.py --- test/test_pipeline/test_multigauss.py | 50 
+++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 test/test_pipeline/test_multigauss.py diff --git a/test/test_pipeline/test_multigauss.py b/test/test_pipeline/test_multigauss.py new file mode 100644 index 0000000..76734d5 --- /dev/null +++ b/test/test_pipeline/test_multigauss.py @@ -0,0 +1,50 @@ +import logging +import math +import os + +import yaml + +from versa.scorer_shared import ( + find_files, + list_scoring, + load_score_modules, +) + + +def info_update(): + + # find files + if os.path.isdir("test/test_samples/test2"): + gen_files = find_files("test/test_samples/test2") + + logging.info("The number of utterances = %d" % len(gen_files)) + + with open("egs/separate_metrics/multigauss.yaml", "r", encoding="utf-8") as f: + score_config = yaml.full_load(f) + + score_modules = load_score_modules( + score_config, + use_gt=False, + use_gpu=False, + ) + + assert len(score_config) > 0, "no scoring function is provided" + + score_info = list_scoring( + gen_files, score_modules, output_file=None, io="soundfile" + ) + print(score_info) + if ( + len(score_info) > 0 + and "multigauss_mos" in score_info[0] + and "multigauss_noi" in score_info[0] + and "multigauss_col" in score_info[0] + and "multigauss_dis" in score_info[0] + and "multigauss_loud" in score_info[0] + and "multigauss_covariance" in score_info[0] + ): + print("check successful", flush=True) + + +if __name__ == "__main__": + info_update() From 860ad628c17acedc80286c2be8bd375c901760c3 Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 01:38:38 -0500 Subject: [PATCH 08/10] Fix covariance calculation in multigauss_metric to use CPU tensor before conversion to numpy. 
--- versa/utterance_metrics/multigauss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index aac94ba..7cfd0fc 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -131,7 +131,7 @@ def multigauss_metric(models, pred_x, fs): "multigauss_col": mean_prediction[0][2].item(), "multigauss_dis": mean_prediction[0][3].item(), "multigauss_loud": mean_prediction[0][4].item(), - "multigauss_covariance": covariance_prediction[0].numpy(), # ["mos", "noi", "col", "dis", "loud"] + "multigauss_covariance": covariance_prediction[0].cpu().numpy(), # ["mos", "noi", "col", "dis", "loud"] } From 57608d45496abdfe0b624d8996c7da6c57bb6645 Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 01:47:58 -0500 Subject: [PATCH 09/10] Fix non probabilistic model in multigauss_metric --- versa/utterance_metrics/multigauss.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index 7cfd0fc..1fb03b8 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -95,6 +95,7 @@ def multigauss_model_setup( return { "ssl_model_extract": ssl_model_extract, "multigauss_model": multigauss_model, + "model_tag": model_tag, "device": device, } @@ -122,20 +123,28 @@ def multigauss_metric(models, pred_x, fs): feature = ( models["ssl_model_extract"](pred_x.to(device=models["device"])).squeeze().T ) - mean_prediction, covariance_prediction = models["multigauss_model"]( - feature.unsqueeze(0) - ) - return { + if models["model_tag"] == "probabilistic": + mean_prediction, covariance_prediction = models["multigauss_model"]( + feature.unsqueeze(0) + ) + else: + mean_prediction = models["multigauss_model"](feature.unsqueeze(0)) + covariance_prediction = None + result = { "multigauss_mos": mean_prediction[0][0].item(), 
"multigauss_noi": mean_prediction[0][1].item(), "multigauss_col": mean_prediction[0][2].item(), "multigauss_dis": mean_prediction[0][3].item(), "multigauss_loud": mean_prediction[0][4].item(), - "multigauss_covariance": covariance_prediction[0].cpu().numpy(), # ["mos", "noi", "col", "dis", "loud"] } + if covariance_prediction is not None: + result["multigauss_covariance"] = covariance_prediction[0].cpu().numpy() # ["mos", "noi", "col", "dis", "loud"] + return result if __name__ == "__main__": a = np.random.random(16000) - model = multigauss_model_setup(use_gpu=True if torch.cuda.is_available() else False) + model = multigauss_model_setup(model_tag="probabilistic", use_gpu=True if torch.cuda.is_available() else False) + print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}") + model = multigauss_model_setup(model_tag="non_probabilistic", use_gpu=True if torch.cuda.is_available() else False) print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}") From 403646b47d329deabbc2fad0c9cf89165e44bc43 Mon Sep 17 00:00:00 2001 From: jhan Date: Wed, 3 Dec 2025 02:07:12 -0500 Subject: [PATCH 10/10] black formatted --- versa/utterance_metrics/multigauss.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/versa/utterance_metrics/multigauss.py b/versa/utterance_metrics/multigauss.py index 1fb03b8..e5d524f 100644 --- a/versa/utterance_metrics/multigauss.py +++ b/versa/utterance_metrics/multigauss.py @@ -138,13 +138,20 @@ def multigauss_metric(models, pred_x, fs): "multigauss_loud": mean_prediction[0][4].item(), } if covariance_prediction is not None: - result["multigauss_covariance"] = covariance_prediction[0].cpu().numpy() # ["mos", "noi", "col", "dis", "loud"] + result["multigauss_covariance"] = ( + covariance_prediction[0].cpu().numpy() + ) # ["mos", "noi", "col", "dis", "loud"] return result if __name__ == "__main__": a = np.random.random(16000) - model = multigauss_model_setup(model_tag="probabilistic", use_gpu=True if 
torch.cuda.is_available() else False) + model = multigauss_model_setup( + model_tag="probabilistic", use_gpu=True if torch.cuda.is_available() else False + ) print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}") - model = multigauss_model_setup(model_tag="non_probabilistic", use_gpu=True if torch.cuda.is_available() else False) + model = multigauss_model_setup( + model_tag="non_probabilistic", + use_gpu=True if torch.cuda.is_available() else False, + ) print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}")