Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ We include x mark if the metric is auto-installed in versa.
| 53 | |SIG-MOS | sigmos | {SIGMOS_COL, SIGMOS_DISC, SIGMOS_LOUD, SIGMOS_REVERB, SIGMOS_SIG, SIGMOS_OVRL} | [sigmos](https://github.com/microsoft/SIG-Challenge/tree/main/ICASSP2024/sigmos) |[paper](https://arxiv.org/pdf/2309.07385) |
| 54 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) |
| 55 | x | Singing voice MOS | pseudo_mos | singmos_pro |[singmos](https://github.com/South-Twilight/SingMOS) | [paper](https://arxiv.org/abs/2510.01812) |
| 56 |  | Multivariate Probabilistic Assessment of Speech Quality | multigauss | multigauss_{mos,noi,col,dis,loud}, multigauss_covariance (probabilistic model only) | [MultiGauss](https://github.com/fcumlin/MultiGauss) | [paper](https://arxiv.org/abs/2506.04890) |


### Dependent Metrics
|Number| Auto-Install | Metric Name (Auto-Install) | Key in config | Key in report | Code Source | References |
Expand Down
4 changes: 4 additions & 0 deletions egs/separate_metrics/multigauss.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Multivariate Probabilistic Assessment of Speech Quality (MultiGauss)
# model_tag selects the checkpoint: "probabilistic" (default) or "non_probabilistic".

- name: multigauss
  model_tag: probabilistic
50 changes: 50 additions & 0 deletions test/test_pipeline/test_multigauss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging
import math
import os

import yaml

from versa.scorer_shared import (
find_files,
list_scoring,
load_score_modules,
)


def info_update():
    """Run the MultiGauss metric on the bundled samples and check its report keys.

    Loads ``egs/separate_metrics/multigauss.yaml``, scores every file found in
    ``test/test_samples/test2`` on CPU without ground truth, and verifies that
    the first result contains all MultiGauss keys.

    Raises:
        FileNotFoundError: If the sample directory does not exist.
        AssertionError: If the config defines no scoring function.
        ValueError: If nothing was scored or an expected key is missing.
    """
    sample_dir = "test/test_samples/test2"
    # Fail with a clear message; previously a missing directory left
    # ``gen_files`` unbound and surfaced as a NameError further down.
    if not os.path.isdir(sample_dir):
        raise FileNotFoundError(f"Sample directory not found: {sample_dir}")
    gen_files = find_files(sample_dir)

    logging.info("The number of utterances = %d", len(gen_files))

    with open("egs/separate_metrics/multigauss.yaml", "r", encoding="utf-8") as f:
        score_config = yaml.full_load(f)

    # Validate the config before paying for model loading.
    assert score_config, "no scoring function is provided"

    score_modules = load_score_modules(
        score_config,
        use_gt=False,
        use_gpu=False,
    )

    score_info = list_scoring(
        gen_files, score_modules, output_file=None, io="soundfile"
    )
    print(score_info)

    if len(score_info) == 0:
        raise ValueError("MultiGauss produced no scores")

    expected_keys = (
        "multigauss_mos",
        "multigauss_noi",
        "multigauss_col",
        "multigauss_dis",
        "multigauss_loud",
        "multigauss_covariance",
    )
    missing_keys = [key for key in expected_keys if key not in score_info[0]]
    # A silent pass would hide a broken metric, so report what is missing.
    if missing_keys:
        raise ValueError(f"MultiGauss result is missing keys: {missing_keys}")
    print("check successful", flush=True)


# Allow running this pipeline check directly as a script.
if __name__ == "__main__":
    info_update()
6 changes: 6 additions & 0 deletions tools/install_multigauss.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Install MultiGauss: clone the upstream repo into tools/checkpoints/multigauss
# and install its gin-config dependency.

# Abort on the first failure so pip does not run after a failed clone.
set -euo pipefail

# Quote every expansion so paths containing spaces keep working.
tools_dir="$(dirname "$(realpath "$0")")"
target_dir="${tools_dir}/checkpoints/multigauss"

## cloning the MultiGauss repo into the checkpoint folder
if [ -d "${target_dir}/.git" ]; then
    # Re-running the installer must not fail on an existing checkout.
    echo "MultiGauss already cloned at ${target_dir}; skipping clone."
else
    git clone https://github.com/fcumlin/MultiGauss.git "${target_dir}"
fi
pip install gin-config
2 changes: 2 additions & 0 deletions versa/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,6 @@
"arecho_wer",
"arecho_cer",
"arecho_nisqa_real_mos",
"multigauss_mean",
"multigauss_covariance",
]
21 changes: 21 additions & 0 deletions versa/scorer_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,23 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
}
logging.info("Initiate ARECHO no-reference evaluation successfully.")

elif config["name"] == "multigauss":
logging.info("Loading MultiGauss model...")
from versa.utterance_metrics.multigauss import (
multigauss_model_setup,
multigauss_metric,
)

multigauss_model = multigauss_model_setup(
model_tag=config.get("model_tag", "probabilistic"),
use_gpu=use_gpu,
)
score_modules["multigauss"] = {
"module": multigauss_metric,
"model": multigauss_model,
}
logging.info("Initiate MultiGauss evaluation successfully.")

return score_modules


Expand Down Expand Up @@ -1263,6 +1280,10 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
score = score_modules[key]["module"](
score_modules[key]["model"], gen_wav, gen_sr
)
elif key == "multigauss":
score = score_modules[key]["module"](
score_modules[key]["model"], gen_wav, gen_sr
)
else:
raise NotImplementedError(
f"Not supported metrics: {key}, check egs/separate_metrics/README.md for supported metrics"
Expand Down
157 changes: 157 additions & 0 deletions versa/utterance_metrics/multigauss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/env python3

# Copyright 2025 Jionghao Han
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
# This file includes code adapted from the MultiGauss project:
# https://github.com/fcumlin/MultiGauss
# Copyright (c) 2025 Fredrik Cumlin
# Licensed under the MIT License


r"""
Notes from the MultiGauss project (Fredrik Cumlin):
The model operates at a 16 kHz sample rate on signals of 10 s duration; hence,
all audio is resampled to 16 kHz and repeated or cropped to 10 s before
processing. Note that the sample rate implies that no energy at frequencies
above 8 kHz is seen by the model.
"""

import logging

logger = logging.getLogger(__name__)

import sys
from pathlib import Path
import librosa
import numpy as np
import torch
import torchaudio

# Checkout location used by `tools/install_multigauss.sh`
# (<repo root>/tools/checkpoints/multigauss).
MULTIGAUSS_DIR = (
    Path(__file__).parent.parent.parent / "tools" / "checkpoints" / "multigauss"
)
# Report through the module logger rather than printing on every import.
logger.info("MULTIGAUSS_DIR: %s", MULTIGAUSS_DIR)
try:
    import gin

    # `model` and `train` are top-level modules of the MultiGauss checkout, so
    # the checkout directory has to be importable.
    sys.path.append(str(MULTIGAUSS_DIR))
    import model as model_lib
    from train import TrainingLoop
except ImportError as err:
    # Chain the original error so the actually missing module stays visible.
    raise ImportError(
        "MultiGauss is not set up. Please install the package via "
        "`tools/install_multigauss.sh`"
    ) from err


def _repeat_and_crop_to_length(
waveform: torch.Tensor,
target_length: int = 160_000,
) -> torch.Tensor:
"""Repeates or crops the waveform to give it the target length."""
current_length = waveform.shape[-1]
if current_length < target_length:
num_repeats = target_length // current_length + 1
waveform = waveform.repeat(1, num_repeats)
return waveform[:, :target_length]


def multigauss_model_setup(
    model_tag="probabilistic", cache_dir="versa_cache", use_gpu=False
):
    """Load the SSL feature extractor and the MultiGauss projection head.

    Args:
        model_tag (str): Sub-folder of ``MULTIGAUSS_DIR / "runs"`` to load.
            Defaults to "probabilistic". Can be "probabilistic" or
            "non_probabilistic".
        cache_dir (str): Cache directory. The torchaudio checkpoint is stored
            under ``<cache_dir>/torchaudio``. Defaults to "versa_cache".
        use_gpu (bool, optional): Whether to use GPU. Defaults to False.

    Returns:
        dict: ``ssl_model_extract`` (callable mapping a waveform batch to the
        features of the configured SSL layer), ``multigauss_model`` (the
        projection head in eval mode), ``model_tag`` and ``device``.

    Raises:
        FileNotFoundError: If the gin config or the checkpoint for
            ``model_tag`` is missing from the MultiGauss checkout.
    """
    device = "cuda" if use_gpu else "cpu"
    model_folder = MULTIGAUSS_DIR / "runs" / model_tag
    config_path = model_folder / "config.gin"
    checkpoint_path = model_folder / "model_best_state_dict.pt"
    # Surface a wrong model_tag or an incomplete checkout before any loading.
    for required_file in (config_path, checkpoint_path):
        if not required_file.is_file():
            raise FileNotFoundError(
                f"MultiGauss file not found: {required_file}. Check model_tag "
                f"{model_tag!r} and run `tools/install_multigauss.sh`."
            )
    logger.info("Loading model from %s", model_folder)

    # Reset gin state so a previously loaded model_tag does not leak in.
    gin.clear_config()
    # NOTE(review): this registers TrainingLoop on every call; confirm gin
    # tolerates repeated registration when several tags are loaded.
    gin.external_configurable(TrainingLoop)
    gin.parse_config_file(str(config_path), skip_unknown=True)
    ssl_model_layer = gin.query_parameter("TrainingLoop.ssl_layer")

    bundle = torchaudio.pipelines.WAV2VEC2_XLSR_2B
    ssl_model = bundle.get_model(
        dl_kwargs=dict(model_dir=str(Path(cache_dir) / "torchaudio"))
    ).to(device=device)
    ssl_model.eval()

    def ssl_model_extract(x):
        # Keep only the layer the checkpoint was trained on (from the gin config).
        return ssl_model.extract_features(x)[0][ssl_model_layer]

    # in_shape is presumably (feature dim, frames for 10 s of 16 kHz audio)
    # -- TODO confirm against the MultiGauss training config.
    multigauss_model = model_lib.ProjectionHead(in_shape=(1920, 499))
    state_dict = torch.load(
        checkpoint_path,
        map_location=device,
        weights_only=True,
    )
    multigauss_model.load_state_dict(state_dict)
    multigauss_model = multigauss_model.to(device=device)
    multigauss_model.eval()
    return {
        "ssl_model_extract": ssl_model_extract,
        "multigauss_model": multigauss_model,
        "model_tag": model_tag,
        "device": device,
    }


def multigauss_metric(models, pred_x, fs):
    """Calculate multigauss score for audio.

    The signal is resampled to 16 kHz and repeated or cropped to 10 s before
    it is passed to the SSL extractor and the MultiGauss head.

    Args:
        models (dict): The loaded models, as returned by
            ``multigauss_model_setup``.
        pred_x (np.ndarray): Audio signal. A 1-D array is treated as a single
            channel; a 2-D array is processed along its last axis.
        fs (int): Sampling rate.

    Returns:
        dict: ``multigauss_mos``, ``multigauss_noi``, ``multigauss_col``,
        ``multigauss_dis`` and ``multigauss_loud`` as floats, plus
        ``multigauss_covariance`` (numpy array over the same five dimensions)
        when the model tag is "probabilistic".
    """
    pred_x = torch.from_numpy(pred_x).float()
    # Normalise (time,) to (1, time). The crop step indexes two axes, so a
    # 1-D signal of 10 s or longer would otherwise raise IndexError.
    if pred_x.dim() == 1:
        pred_x = pred_x.unsqueeze(0)
    if fs != 16000:
        pred_x = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(pred_x)
    pred_x = _repeat_and_crop_to_length(
        pred_x,
        target_length=160_000,  # Training was done with 10 s of audio (16 kHz).
    )

    with torch.no_grad():
        feature = (
            models["ssl_model_extract"](pred_x.to(device=models["device"])).squeeze().T
        )
        if models["model_tag"] == "probabilistic":
            mean_prediction, covariance_prediction = models["multigauss_model"](
                feature.unsqueeze(0)
            )
        else:
            mean_prediction = models["multigauss_model"](feature.unsqueeze(0))
            covariance_prediction = None

    # Order of the predicted quality dimensions in the model output.
    dimensions = ("mos", "noi", "col", "dis", "loud")
    result = {
        f"multigauss_{name}": mean_prediction[0][index].item()
        for index, name in enumerate(dimensions)
    }
    if covariance_prediction is not None:
        result["multigauss_covariance"] = (
            covariance_prediction[0].cpu().numpy()
        )  # ["mos", "noi", "col", "dis", "loud"]
    return result


if __name__ == "__main__":
    # Smoke test: score one second of random audio with both checkpoints,
    # using the GPU whenever one is available.
    a = np.random.random(16000)
    gpu_available = torch.cuda.is_available()
    for tag in ("probabilistic", "non_probabilistic"):
        model = multigauss_model_setup(model_tag=tag, use_gpu=gpu_available)
        print(f"MultiGauss metrics: {multigauss_metric(model, a, 16000)}")
Loading