From 07f984c706e002ab48a5267f51a122235848b3e9 Mon Sep 17 00:00:00 2001 From: root <1214946890@qq.com> Date: Wed, 19 Nov 2025 16:23:43 +0000 Subject: [PATCH] feat: update singing mos with singmos_pro and singmos_v1 --- docs/supported_metrics.md | 5 ++-- egs/separate_metrics/pseudo_mos.yaml | 4 +-- egs/singing.yaml | 2 +- versa/utterance_metrics/pseudo_mos.py | 40 +++++++++++++-------------- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md index 5798ba3..7c4ef0f 100644 --- a/docs/supported_metrics.md +++ b/docs/supported_metrics.md @@ -13,8 +13,7 @@ We include x mark if the metric is auto-installed in versa. | 6 | x | PESQ in TorchAudio-Squim | squim_no_ref | torch_squim_pesq | [torch_squim](https://pytorch.org/audio/main/tutorials/squim_tutorial.html) | [paper](https://arxiv.org/abs/2304.01448) | | 7 | x | STOI in TorchAudio-Squim | squim_no_ref | torch_squim_stoi | [torch_squim](https://pytorch.org/audio/main/tutorials/squim_tutorial.html) | [paper](https://arxiv.org/abs/2304.01448) | | 8 | x | SI-SDR in TorchAudio-Squim | squim_no_ref | torch_squim_si_sdr | [torch_squim](https://pytorch.org/audio/main/tutorials/squim_tutorial.html) | [paper](https://arxiv.org/abs/2304.01448) | -| 9 | x | Singing voice MOS | singmos | singmos |[singmos](https://github.com/South-Twilight/SingMOS/tree/main) | [paper](https://arxiv.org/abs/2406.10911) | -| 9 | x | Singing voice MOS | singmos_v2 | singmos_v2 |[singmos](https://github.com/South-Twilight/SingMOS/tree/main) | [paper](https://arxiv.org/abs/2406.10911) | +| 9 | x | Singing voice MOS | pseudo_mos | singmos_v1 |[singmos](https://github.com/South-Twilight/SingMOS) | [paper](https://arxiv.org/abs/2406.10911) | | 10 | x | Sheet SSQA MOS Models | sheet_ssqa | sheet_ssqa |[Sheet](https://github.com/unilight/sheet/tree/main) | [paper](https://arxiv.org/abs/2411.03715) | | 11 | | UTMOSv2: UTokyo-SaruLab MOS Prediction System | utmosv2 | utmosv2 |[UTMOSv2](https://github.com/sarulab-speech/UTMOSv2) | [paper](https://arxiv.org/abs/2409.09305) | | 12 | | Speech Contrastive Regression for Quality Assessment without reference (ScoreQ) | scoreq_nr | scoreq_nr |[ScoreQ](https://github.com/ftshijt/scoreq/tree/main) | [paper](https://arxiv.org/pdf/2410.06675) | @@ -60,7 +59,7 @@ We include x mark if the metric is auto-installed in versa. | 52 | | WV-MOS (MOS score prediction by fine-tuned wav2vec2.0 model) | wvmos | wvmos | [wvmos](https://github.com/AndreevP/wvmos) | [paper](https://arxiv.org/abs/2203.13086) | | 53 | |SIG-MOS | sigmos | {SIGMOS_COL, SIGMOS_DISC, SIGMOS_LOUD, SIGMOS_REVERB, SIGMOS_SIG, SIGMOS_OVRL} | [sigmos](https://github.com/microsoft/SIG-Challenge/tree/main/ICASSP2024/sigmos) |[paper](https://arxiv.org/pdf/2309.07385) | | 54 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) | - +| 55 | x | Singing voice MOS | pseudo_mos | singmos_pro |[singmos](https://github.com/South-Twilight/SingMOS) | [paper](https://arxiv.org/abs/2510.01812) | ### Dependent Metrics |Number| Auto-Install | Metric Name (Auto-Install) | Key in config | Key in report | Code Source | References | diff --git a/egs/separate_metrics/pseudo_mos.yaml b/egs/separate_metrics/pseudo_mos.yaml index a87831c..0365281 100644 --- a/egs/separate_metrics/pseudo_mos.yaml +++ b/egs/separate_metrics/pseudo_mos.yaml @@ -12,9 +12,9 @@ fs: 16000 plcmos: fs: 16000 - singmos: + singmos_v1: fs: 16000 - singmos_v2: + singmos_pro: fs: 16000 dnsmos_pro_bvcc: fs: 16000 diff --git a/egs/singing.yaml b/egs/singing.yaml index b12c629..251b35c 100644 --- a/egs/singing.yaml +++ b/egs/singing.yaml @@ -7,7 +7,7 @@ cache_dir: versa_cache/audiobox - name: pseudo_mos - predictor_types: ["singmos", "singmos_v2"] + predictor_types: ["singmos_pro", "singmos_v1"] # An overall model on MOS-bench from Sheet toolkit # More info in https://github.com/unilight/sheet/tree/main diff --git a/versa/utterance_metrics/pseudo_mos.py b/versa/utterance_metrics/pseudo_mos.py index 3622d69..7402cb1 100644 --- a/versa/utterance_metrics/pseudo_mos.py +++ b/versa/utterance_metrics/pseudo_mos.py @@ -89,20 +89,20 @@ def pseudo_mos_setup( predictor_fs["plcmos"] = predictor_args["plcmos"]["fs"] elif predictor == "utmos" or predictor == "utmosv2": continue # already initialized - elif predictor == "singmos": + elif predictor == "singmos_v1": torch.hub.set_dir(cache_dir) singmos = torch.hub.load( - "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True + "South-Twilight/SingMOS:v1.1.1", "singmos_v1", trust_repo=True ).to(device) - predictor_dict["singmos"] = singmos - predictor_fs["singmos"] = 16000 - elif predictor == "singmos_v2": + predictor_dict["singmos_v1"] = singmos + predictor_fs["singmos_v1"] = 16000 + elif predictor == "singmos_pro": torch.hub.set_dir(cache_dir) singmos = torch.hub.load( - "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos_v2", trust_repo=True + "South-Twilight/SingMOS:v1.1.1", "singmos_pro", trust_repo=True ).to(device) - predictor_dict["singmos_v2"] = singmos - predictor_fs["singmos_v2"] = 16000 + predictor_dict["singmos_pro"] = singmos + predictor_fs["singmos_pro"] = 16000 elif predictor.startswith("dnsmos_pro_"): variant = predictor[len("dnsmos_pro_") :] model_path = Path(cache_dir) / f"dnsmos_pro_{variant}.pt" @@ -209,10 +209,10 @@ def pseudo_mos_metric(pred, fs, predictor_dict, predictor_fs, use_gpu=False): max_val = np.max(np.abs(pred_plcmos)) score = predictor_dict["plcmos"].run(pred_plcmos / max_val, sr=fs) scores.update(plcmos=score["plcmos"]) - elif predictor == "singmos": - if fs != predictor_fs["singmos"]: + elif predictor == "singmos_v1": + if fs != predictor_fs["singmos_v1"]: pred_singmos = librosa.resample( - pred, orig_sr=fs, target_sr=predictor_fs["singmos"] + pred, orig_sr=fs, target_sr=predictor_fs["singmos_v1"] ) else: pred_singmos = pred @@ -221,14 +221,14 @@ def pseudo_mos_metric(pred, fs, predictor_dict, predictor_fs, use_gpu=False): if use_gpu: pred_tensor = pred_tensor.to("cuda") length_tensor = length_tensor.to("cuda") - score = predictor_dict["singmos"](pred_tensor.float(), length_tensor)[ + score = predictor_dict["singmos_v1"](pred_tensor.float(), length_tensor)[ 0 ].item() - scores.update(singmos=score) - elif predictor == "singmos_v2": - if fs != predictor_fs["singmos_v2"]: + scores.update(singmos_v1=score) + elif predictor == "singmos_pro": + if fs != predictor_fs["singmos_pro"]: pred_singmos = librosa.resample( - pred, orig_sr=fs, target_sr=predictor_fs["singmos_v2"] + pred, orig_sr=fs, target_sr=predictor_fs["singmos_pro"] ) else: pred_singmos = pred @@ -237,10 +237,10 @@ def pseudo_mos_metric(pred, fs, predictor_dict, predictor_fs, use_gpu=False): if use_gpu: pred_tensor = pred_tensor.to("cuda") length_tensor = length_tensor.to("cuda") - score = predictor_dict["singmos_v2"](pred_tensor.float(), length_tensor)[ + score = predictor_dict["singmos_pro"](pred_tensor.float(), length_tensor)[ 0 ].item() - scores.update(singmos_v2=score) + scores.update(singmos_pro=score) elif predictor.startswith("dnsmos_pro_"): if fs != predictor_fs[predictor]: pred_dnsmos_pro = librosa.resample( @@ -305,8 +305,8 @@ def stft( "utmos", "dnsmos", "plcmos", - "singmos", - "singmos_v2", + "singmos_v1", + "singmos_pro", "dnsmos_pro_bvcc", "dnsmos_pro_nisqa", "dnsmos_pro_vcc2018",