From 6d5d931dcd632961695c3ef81f58c4a432fe2b2d Mon Sep 17 00:00:00 2001
From: BohaoSu
Date: Tue, 24 Jun 2025 18:44:26 -0400
Subject: [PATCH 01/14] add description function using text LLMs

---
 scripts/description/text_llm_description.py | 66 +++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 scripts/description/text_llm_description.py

diff --git a/scripts/description/text_llm_description.py b/scripts/description/text_llm_description.py
new file mode 100644
index 0000000..e4805d1
--- /dev/null
+++ b/scripts/description/text_llm_description.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 BoHao Su
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Module for text LLM description."""
+
+import json
+from tqdm.auto import tqdm
+
+def create_template(metrics: dict) -> str:
+    prompt = f"""
+    ## Background
+    You are a professional audio descriptor.\n\n
+    ## Task instruction
+    Describe the audio according to the following predicted metrics and choose the top-10 most important metrics according to your description.\n\n
+    ## Output Format
+    Provide the audio description, top-10 metric selection, and reasoning for your selection.
+    You should respond in JSON format. First provide a one-sentence concise description for the audio in field 'description', then your top-10 metrics selection in field 'top-10 metrics', followed by a reason in the field 'reasoning'. For example,
+    ```
+    {{"description": "<your description>", "top-10 metrics": <your top-10 metrics selection>, "reasoning": <your reasoning>}}
+    ```
+    ## Metrics:\n{metrics}\n\n
+    ## Answer:\n
+    """
+    return prompt
+
+def describe_all(metrics_dict: dict, model_name: str, modules: dict) -> list:
+    """
+    For each utt_id in metrics_dict, run the prompt through the appropriate
+    module (either tokenizer+model or pipeline) and return a list of results.
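+
+    Each returned item is the parsed JSON object from the model with an
+    added "utt_id" key, or a {"utt_id": ..., "error": "parse_failed",
+    "raw": ...} fallback entry when the response cannot be parsed as JSON.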
+ """ + results = [] + mod = modules[model_name]["args"] + for utt_id, metrics in tqdm(metrics_dict.items(), desc=f"Describing with {model_name}"): + prompt = create_template(metrics) + if "model" in mod: + # Qwen or Mistral-style + messages = [ + {"role": "system", "content": "You are a professional audio descriptor."}, + {"role": "user", "content": prompt} + ] + text = mod["tokenizer"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = mod["tokenizer"]([text], return_tensors="pt").to(mod["model"].device) + gen_ids = mod["model"].generate(**inputs, max_new_tokens=1024) + # strip off prompt + gen_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)] + response = mod["tokenizer"].batch_decode(gen_ids, skip_special_tokens=True)[0] + else: + # llama pipeline + messages = [ + {"role": "system", "content": "You are a professional audio descriptor."}, + {"role": "user", "content": prompt} + ] + response = mod["pipe"](messages, max_new_tokens=1024)[0]["generated_text"][-1]["content"] + + # clean & parse + clean = response.strip().strip("```json").strip("```").replace("\n", " ") + try: + obj = json.loads(clean) + obj["utt_id"] = utt_id + results.append(obj) + except json.JSONDecodeError: + # fallback: store raw text + results.append({"utt_id": utt_id, "error": "parse_failed", "raw": response}) + return results From 0fe5a5c318ca8c19d13c2bcebb451067e5b67f5d Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 24 Jun 2025 18:45:20 -0400 Subject: [PATCH 02/14] add a interpreter that can directly apply for the metric results from scorer --- versa/bin/interpreter.py | 112 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 versa/bin/interpreter.py diff --git a/versa/bin/interpreter.py b/versa/bin/interpreter.py new file mode 100644 index 0000000..d58ec15 --- /dev/null +++ b/versa/bin/interpreter.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 + +# Copyright 2025 BoHao Su +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Interpreter Interface for Speech Evaluation.""" + +import argparse +import logging +import json + +import torch +import yaml + +from versa.interpreter_shared import ( + metric_loader_setup, + load_interpreter_modules, +) +from scripts.description.text_llm_description import describe_all + + +def get_parser() -> argparse.Namespace: + """Get argument parser.""" + parser = argparse.ArgumentParser(description="Interpretation for Speech Evaluation Interface") + parser.add_argument( + "--score_output_file", + type=str, + default=None, + help="Path of directory of the score results.", + ) + parser.add_argument( + "--config", + required=True, + help="YAML with interpreter_config (list of model_name dicts)" + ) + parser.add_argument( + "--output_file", + required=True, + help="Where to dump the JSON descriptions" + ) + parser.add_argument( + "--use_gpu", type=bool, default=False, help="whether to use GPU if it can" + ) + parser.add_argument( + "--verbose", + default=1, + type=int, + help="Verbosity level. Higher is more logging.", + ) + parser.add_argument( + "--rank", + default=0, + type=int, + help="the overall rank in the batch processing, used to specify GPU rank", + ) + return parser + + +def main(): + args = get_parser().parse_args() + + # In case of using `local` backend, all GPU will be visible to all process. 
+    if args.use_gpu:
+        gpu_rank = args.rank % torch.cuda.device_count()
+        torch.cuda.set_device(gpu_rank)
+        logging.info(f"using device: cuda:{gpu_rank}")
+
+    # logging info
+    if args.verbose > 1:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    metrics = metric_loader_setup(args.score_output_file)
+    logging.info("The number of utterances = %d" % len(metrics))
+
+    # 2) Load interpreter modules from YAML
+    with open(args.config) as cf:
+        cfg = yaml.safe_load(cf)
+    interpreter_modules = load_interpreter_modules(
+        cfg["interpreter_config"],
+        use_gpu=args.use_gpu,
+    )
+
+    # 3) Run description for each model
+    all_results = []
+    for model_cfg in cfg["interpreter_config"]:
+        name = model_cfg["model_name"]
+        logging.info(f"Describing with {name}")
+        res = describe_all(metrics, name, interpreter_modules)
+        all_results.extend(res)
+
+    # 4) Dump
+    with open(args.output_file, "w") as outf:
+        json.dump(all_results, outf, ensure_ascii=False, indent=2)
+    logging.info(f"Wrote descriptions to {args.output_file}")
+
+
+if __name__ == "__main__":
+    main()

From 576794b16101b730796a0a46fe936ad81188113e Mon Sep 17 00:00:00 2001
From: BohaoSu
Date: Tue, 24 Jun 2025 18:46:09 -0400
Subject: [PATCH 03/14] add common functions for interpreter usage and
 future model extensions

---
 versa/interpreter_shared.py | 75 +++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 versa/interpreter_shared.py

diff --git a/versa/interpreter_shared.py b/versa/interpreter_shared.py
new file mode 100644
index 0000000..e805325
--- /dev/null
+++ b/versa/interpreter_shared.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 BoHao Su
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+from transformers import pipeline
+from huggingface_hub import login
+
+def metric_loader_setup(score_output_file):
+    """
+    Reads an scp-like file where each line is:
+        utt_id 
+    Returns a dict mapping utt_id → metrics dict.
+ """ + data = {} + with open(score_output_file, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + utt_id, json_str = line.split(maxsplit=1) + data[utt_id] = json.loads(json_str) + return data + +def load_interpreter_modules(interpreter_config, use_gpu): + assert interpreter_config, "no interpreter function is provided" + interpreter_modules = {} + for config in interpreter_config: + print(config, flush=True) + if config["model_name"] == "Qwen/Qwen2.5-7B-Instruct": + model = AutoModelForCausalLM.from_pretrained( + config["model_name"], + torch_dtype="auto", + device_map="auto" if use_gpu else None, + ) + tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) + interpreter_modules[config["model_name"]] = { + "args": { + "model": model, + "tokenizer": tokenizer, + }, + } + elif config["model_name"] == "mistralai/Mistral-7B-Instruct-v0.3": + login(token=config["HF_TOKEN"]) + model = AutoModelForCausalLM.from_pretrained( + config["model_name"], + torch_dtype="auto", + device_map="auto" if use_gpu else None, + ) + tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) + interpreter_modules[config["model_name"]] = { + "args": { + "model": model, + "tokenizer": tokenizer, + }, + } + elif config["model_name"] == "meta-llama/Llama-3.1-8B-Instruct": + login(token=config["HF_TOKEN"]) + pipe = pipeline( + "text-generation", + model=config["model_name"], + torch_dtype=torch.bfloat16, + device_map="auto" if use_gpu else None, + ) + interpreter_modules[config["model_name"]] = { + "args": { + "pipe": pipe, + }, + } + else: + raise ValueError(f"Unsupported model_name: {config["model_name"]}") + return interpreter_modules \ No newline at end of file From fe93aeac1c926c2c52c97613ba47ff3e49f44057 Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 24 Jun 2025 18:50:18 -0400 Subject: [PATCH 04/14] run isort and black on the interpreter_shared.py --- versa/interpreter_shared.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/versa/interpreter_shared.py b/versa/interpreter_shared.py index e805325..1e23a3c 100644 --- a/versa/interpreter_shared.py +++ b/versa/interpreter_shared.py @@ -4,10 +4,11 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import json -from transformers import AutoModelForCausalLM, AutoTokenizer + import torch -from transformers import pipeline from huggingface_hub import login +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + def metric_loader_setup(score_output_file): """ @@ -25,6 +26,7 @@ def metric_loader_setup(score_output_file): data[utt_id] = json.loads(json_str) return data + def load_interpreter_modules(interpreter_config, use_gpu): assert interpreter_config, "no interpreter function is provided" interpreter_modules = {} @@ -71,5 +73,5 @@ def load_interpreter_modules(interpreter_config, use_gpu): }, } else: - raise ValueError(f"Unsupported model_name: {config["model_name"]}") - return interpreter_modules \ No newline at end of file + raise ValueError(f"Unsupported model_name: {config['model_name']}") + return interpreter_modules From 80684693a8435d41ff208c38f55adcaaa123643e Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 24 Jun 2025 18:51:18 -0400 Subject: [PATCH 05/14] run isort and black on the interpreter.py --- versa/bin/interpreter.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/versa/bin/interpreter.py b/versa/bin/interpreter.py index d58ec15..f8a4767 100644 --- a/versa/bin/interpreter.py +++ b/versa/bin/interpreter.py @@ -6,22 
+6,20 @@ """Interpreter Interface for Speech Evaluation.""" import argparse -import logging import json +import logging import torch import yaml - -from versa.interpreter_shared import ( - metric_loader_setup, - load_interpreter_modules, -) from scripts.description.text_llm_description import describe_all +from versa.interpreter_shared import load_interpreter_modules, metric_loader_setup def get_parser() -> argparse.Namespace: """Get argument parser.""" - parser = argparse.ArgumentParser(description="Interpretation for Speech Evaluation Interface") + parser = argparse.ArgumentParser( + description="Interpretation for Speech Evaluation Interface" + ) parser.add_argument( "--score_output_file", type=str, @@ -31,12 +29,10 @@ def get_parser() -> argparse.Namespace: parser.add_argument( "--config", required=True, - help="YAML with interpreter_config (list of model_name dicts)" + help="YAML with interpreter_config (list of model_name dicts)", ) parser.add_argument( - "--output_file", - required=True, - help="Where to dump the JSON descriptions" + "--output_file", required=True, help="Where to dump the JSON descriptions" ) parser.add_argument( "--use_gpu", type=bool, default=False, help="whether to use GPU if it can" From 5696ef116d73cbb4b46784b3cbf5b4563fa7bdfd Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 24 Jun 2025 18:52:08 -0400 Subject: [PATCH 06/14] run isort and black on the text_llm_description.py --- scripts/description/text_llm_description.py | 39 +++++++++++++++------ 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/scripts/description/text_llm_description.py b/scripts/description/text_llm_description.py index e4805d1..2d4b9e6 100644 --- a/scripts/description/text_llm_description.py +++ b/scripts/description/text_llm_description.py @@ -6,8 +6,10 @@ """Module for text LLM description.""" import json + from tqdm.auto import tqdm + def create_template(metrics: dict) -> str: prompt = f""" ## Background @@ -25,6 +27,7 @@ def create_template(metrics: dict) -> str: """ return prompt + def describe_all(metrics_dict: dict, model_name: str, modules: dict) -> list: """ For each utt_id in metrics_dict, run the prompt through the appropriate @@ -32,27 +35,43 @@ def describe_all(metrics_dict: dict, model_name: str, modules: dict) -> list: """ results = [] mod = modules[model_name]["args"] - for utt_id, metrics in tqdm(metrics_dict.items(), desc=f"Describing with {model_name}"): + for utt_id, metrics in tqdm( + metrics_dict.items(), desc=f"Describing with {model_name}" + ): prompt = create_template(metrics) if "model" in mod: # Qwen or Mistral-style messages = [ - {"role": "system", "content": "You are a professional audio descriptor."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a professional audio descriptor.", + }, + {"role": "user", "content": prompt}, ] - text = mod["tokenizer"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - inputs = mod["tokenizer"]([text], return_tensors="pt").to(mod["model"].device) + text = mod["tokenizer"].apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = mod["tokenizer"]([text], return_tensors="pt").to( + mod["model"].device + ) gen_ids = mod["model"].generate(**inputs, max_new_tokens=1024) # strip off prompt - gen_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)] - response = mod["tokenizer"].batch_decode(gen_ids, skip_special_tokens=True)[0] + gen_ids = [out[len(inp) :] for inp, out in zip(inputs.input_ids, gen_ids)] + 
response = mod["tokenizer"].batch_decode(gen_ids, skip_special_tokens=True)[ + 0 + ] else: # llama pipeline messages = [ - {"role": "system", "content": "You are a professional audio descriptor."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a professional audio descriptor.", + }, + {"role": "user", "content": prompt}, ] - response = mod["pipe"](messages, max_new_tokens=1024)[0]["generated_text"][-1]["content"] + response = mod["pipe"](messages, max_new_tokens=1024)[0]["generated_text"][ + -1 + ]["content"] # clean & parse clean = response.strip().strip("```json").strip("```").replace("\n", " ") From 36ecd80a2dcfda3bf7cce74ad9588676aea281ec Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Fri, 29 Aug 2025 11:44:38 -0400 Subject: [PATCH 07/14] move to script folder and create a readme file --- scripts/description/README.md | 38 ++++++++ scripts/description/interpreter.py | 108 ++++++++++++++++++++++ scripts/description/interpreter_shared.py | 77 +++++++++++++++ 3 files changed, 223 insertions(+) create mode 100644 scripts/description/README.md create mode 100644 scripts/description/interpreter.py create mode 100644 scripts/description/interpreter_shared.py diff --git a/scripts/description/README.md b/scripts/description/README.md new file mode 100644 index 0000000..07d9ea5 --- /dev/null +++ b/scripts/description/README.md @@ -0,0 +1,38 @@ +# Speech Evaluation Interpreter + +This tool loads utterance-level metrics and uses LLM interpreters to generate natural-language descriptions. + +--- + +## Files +- `interpreter.py`: CLI entry point, loads config, metrics, runs interpreters, saves JSON. +- `interpreter_shared.py`: utilities for loading metrics and models. +- `text_llm_description.py`: **you implement** `describe_all(...)` to describe each utterance. 
+ +--- + +## Example Input + +### `scores.scp` + +``` +utt_0001 {"SNR": 23.1, "WER": 0.08, "MOS": 4.2} +utt_0002 {"SNR": 12.7, "WER": 0.30, "MOS": 3.0} +``` + +### `egs/interpreter.yaml` +```yaml +interpreter_config: + - model_name: "Qwen/Qwen2.5-7B-Instruct" +``` + +## Run + +```bash +python interpreter.py \ + --config egs/interpreter.yaml \ + --score_output_file scores.scp \ + --output_file descriptions.json \ + --use_gpu False \ + --verbose 1 +``` \ No newline at end of file diff --git a/scripts/description/interpreter.py b/scripts/description/interpreter.py new file mode 100644 index 0000000..f34ffe5 --- /dev/null +++ b/scripts/description/interpreter.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Copyright 2025 BoHao Su +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Interpreter Interface for Speech Evaluation.""" + +import argparse +import json +import logging + +import torch +import yaml +from text_llm_description import describe_all +from interpreter_shared import load_interpreter_modules, metric_loader_setup + + +def get_parser() -> argparse.Namespace: + """Get argument parser.""" + parser = argparse.ArgumentParser( + description="Interpretation for Speech Evaluation Interface" + ) + parser.add_argument( + "--score_output_file", + type=str, + default=None, + help="Path of directory of the score results.", + ) + parser.add_argument( + "--config", + required=True, + help="YAML with interpreter_config (list of model_name dicts)", + ) + parser.add_argument( + "--output_file", required=True, help="Where to dump the JSON descriptions" + ) + parser.add_argument( + "--use_gpu", type=bool, default=False, help="whether to use GPU if it can" + ) + parser.add_argument( + "--verbose", + default=1, + type=int, + help="Verbosity level. Higher is more logging.", + ) + parser.add_argument( + "--rank", + default=0, + type=int, + help="the overall rank in the batch processing, used to specify GPU rank", + ) + return parser + + +def main(): + args = get_parser().parse_args() + + # In case of using `local` backend, all GPU will be visible to all process. 
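+    # Pick the device as rank modulo the visible GPU count so that
+    # concurrent jobs spread across GPUs instead of all landing on cuda:0.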
+ if args.use_gpu: + gpu_rank = args.rank % torch.cuda.device_count() + torch.cuda.set_device(gpu_rank) + logging.info(f"using device: cuda:{gpu_rank}") + + # logging info + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.warning("Skip DEBUG/INFO messages") + + metrics = metric_loader_setup(args.score_output_file) + logging.info("The number of utterances = %d" % len(metrics)) + + # 2) Load interpreter modules from YAML + with open(args.config) as cf: + cfg = yaml.safe_load(cf) + interpreter_modules = load_interpreter_modules( + cfg["interpreter_config"], + use_gpu=args.use_gpu, + ) + + # 3) Run description for each model + all_results = [] + for model_cfg in cfg["interpreter_config"]: + name = model_cfg["model_name"] + logging.info(f"Describing with {name}") + res = describe_all(metrics, name, interpreter_modules) + all_results.extend(res) + + # 4) Dump + with open(args.output_file, "w") as outf: + json.dump(all_results, outf, ensure_ascii=False, indent=2) + logging.info(f"Wrote descriptions to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/scripts/description/interpreter_shared.py b/scripts/description/interpreter_shared.py new file mode 100644 index 0000000..1e23a3c --- /dev/null +++ b/scripts/description/interpreter_shared.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright 2025 BoHao Su +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json + +import torch +from huggingface_hub import login +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + + +def metric_loader_setup(score_output_file): + """ + Reads an scp-like file where each line is: + utt_id + Returns a dict mapping utt_id → metrics dict. 
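+
+    Example line (matching scores.scp in the README):
+        utt_0001 {"SNR": 23.1, "WER": 0.08, "MOS": 4.2}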
+ """ + data = {} + with open(score_output_file, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + utt_id, json_str = line.split(maxsplit=1) + data[utt_id] = json.loads(json_str) + return data + + +def load_interpreter_modules(interpreter_config, use_gpu): + assert interpreter_config, "no interpreter function is provided" + interpreter_modules = {} + for config in interpreter_config: + print(config, flush=True) + if config["model_name"] == "Qwen/Qwen2.5-7B-Instruct": + model = AutoModelForCausalLM.from_pretrained( + config["model_name"], + torch_dtype="auto", + device_map="auto" if use_gpu else None, + ) + tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) + interpreter_modules[config["model_name"]] = { + "args": { + "model": model, + "tokenizer": tokenizer, + }, + } + elif config["model_name"] == "mistralai/Mistral-7B-Instruct-v0.3": + login(token=config["HF_TOKEN"]) + model = AutoModelForCausalLM.from_pretrained( + config["model_name"], + torch_dtype="auto", + device_map="auto" if use_gpu else None, + ) + tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) + interpreter_modules[config["model_name"]] = { + "args": { + "model": model, + "tokenizer": tokenizer, + }, + } + elif config["model_name"] == "meta-llama/Llama-3.1-8B-Instruct": + login(token=config["HF_TOKEN"]) + pipe = pipeline( + "text-generation", + model=config["model_name"], + torch_dtype=torch.bfloat16, + device_map="auto" if use_gpu else None, + ) + interpreter_modules[config["model_name"]] = { + "args": { + "pipe": pipe, + }, + } + else: + raise ValueError(f"Unsupported model_name: {config['model_name']}") + return interpreter_modules From 8a5d2d57ae89432c5ad704df7e48b6a2a11660f7 Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Fri, 29 Aug 2025 11:45:26 -0400 Subject: [PATCH 08/14] create an example configuration file for interpretation --- egs/interpreter.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 egs/interpreter.yaml diff --git a/egs/interpreter.yaml b/egs/interpreter.yaml new file mode 100644 index 0000000..6f42eea --- /dev/null +++ b/egs/interpreter.yaml @@ -0,0 +1,8 @@ +# interpreter example yaml config +# A list of interpreter backends your code will load. +# Each item must have at least `model_name`. +# For some models (Mistral, Llama 3.1), you must also provide HF_TOKEN. + +interpreter_config: + # Easiest path: no HF login required in your loader + - model_name: "Qwen/Qwen2.5-7B-Instruct" \ No newline at end of file From 809a14bb703d151507eada145affac7fbb622741 Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Fri, 29 Aug 2025 11:48:24 -0400 Subject: [PATCH 09/14] update readme --- scripts/description/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/description/README.md b/scripts/description/README.md index 07d9ea5..2e48b60 100644 --- a/scripts/description/README.md +++ b/scripts/description/README.md @@ -2,25 +2,23 @@ This tool loads utterance-level metrics and uses LLM interpreters to generate natural-language descriptions. ---- ## Files - `interpreter.py`: CLI entry point, loads config, metrics, runs interpreters, saves JSON. - `interpreter_shared.py`: utilities for loading metrics and models. - `text_llm_description.py`: **you implement** `describe_all(...)` to describe each utterance. 
---- ## Example Input -### `scores.scp` +### scores.scp ``` utt_0001 {"SNR": 23.1, "WER": 0.08, "MOS": 4.2} utt_0002 {"SNR": 12.7, "WER": 0.30, "MOS": 3.0} ``` -### `egs/interpreter.yaml` +### egs/interpreter.yaml ```yaml interpreter_config: - model_name: "Qwen/Qwen2.5-7B-Instruct" From ecd0f17fbda6192f23396afda4db838527a7017c Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Fri, 29 Aug 2025 11:50:04 -0400 Subject: [PATCH 10/14] remove original files --- versa/bin/interpreter.py | 108 ------------------------------------ versa/interpreter_shared.py | 77 ------------------------- 2 files changed, 185 deletions(-) delete mode 100644 versa/bin/interpreter.py delete mode 100644 versa/interpreter_shared.py diff --git a/versa/bin/interpreter.py b/versa/bin/interpreter.py deleted file mode 100644 index f8a4767..0000000 --- a/versa/bin/interpreter.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 BoHao Su -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -"""Interpreter Interface for Speech Evaluation.""" - -import argparse -import json -import logging - -import torch -import yaml -from scripts.description.text_llm_description import describe_all -from versa.interpreter_shared import load_interpreter_modules, metric_loader_setup - - -def get_parser() -> argparse.Namespace: - """Get argument parser.""" - parser = argparse.ArgumentParser( - description="Interpretation for Speech Evaluation Interface" - ) - parser.add_argument( - "--score_output_file", - type=str, - default=None, - help="Path of directory of the score results.", - ) - parser.add_argument( - "--config", - required=True, - help="YAML with interpreter_config (list of model_name dicts)", - ) - parser.add_argument( - "--output_file", required=True, help="Where to dump the JSON descriptions" - ) - parser.add_argument( - "--use_gpu", type=bool, default=False, help="whether to use GPU if it can" - ) - parser.add_argument( - "--verbose", - default=1, - type=int, - help="Verbosity level. Higher is more logging.", - ) - parser.add_argument( - "--rank", - default=0, - type=int, - help="the overall rank in the batch processing, used to specify GPU rank", - ) - return parser - - -def main(): - args = get_parser().parse_args() - - # In case of using `local` backend, all GPU will be visible to all process. 
- if args.use_gpu: - gpu_rank = args.rank % torch.cuda.device_count() - torch.cuda.set_device(gpu_rank) - logging.info(f"using device: cuda:{gpu_rank}") - - # logging info - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.warning("Skip DEBUG/INFO messages") - - metrics = metric_loader_setup(args.score_output_file) - logging.info("The number of utterances = %d" % len(metrics)) - - # 2) Load interpreter modules from YAML - with open(args.config) as cf: - cfg = yaml.safe_load(cf) - interpreter_modules = load_interpreter_modules( - cfg["interpreter_config"], - use_gpu=args.use_gpu, - ) - - # 3) Run description for each model - all_results = [] - for model_cfg in cfg["interpreter_config"]: - name = model_cfg["model_name"] - logging.info(f"Describing with {name}") - res = describe_all(metrics, name, interpreter_modules) - all_results.extend(res) - - # 4) Dump - with open(args.output_file, "w") as outf: - json.dump(all_results, outf, ensure_ascii=False, indent=2) - logging.info(f"Wrote descriptions to {args.output_file}") - - -if __name__ == "__main__": - main() diff --git a/versa/interpreter_shared.py b/versa/interpreter_shared.py deleted file mode 100644 index 1e23a3c..0000000 --- a/versa/interpreter_shared.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 BoHao Su -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -import json - -import torch -from huggingface_hub import login -from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline - - -def metric_loader_setup(score_output_file): - """ - Reads an scp-like file where each line is: - utt_id - Returns a dict mapping utt_id → metrics dict. 
- """ - data = {} - with open(score_output_file, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - utt_id, json_str = line.split(maxsplit=1) - data[utt_id] = json.loads(json_str) - return data - - -def load_interpreter_modules(interpreter_config, use_gpu): - assert interpreter_config, "no interpreter function is provided" - interpreter_modules = {} - for config in interpreter_config: - print(config, flush=True) - if config["model_name"] == "Qwen/Qwen2.5-7B-Instruct": - model = AutoModelForCausalLM.from_pretrained( - config["model_name"], - torch_dtype="auto", - device_map="auto" if use_gpu else None, - ) - tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) - interpreter_modules[config["model_name"]] = { - "args": { - "model": model, - "tokenizer": tokenizer, - }, - } - elif config["model_name"] == "mistralai/Mistral-7B-Instruct-v0.3": - login(token=config["HF_TOKEN"]) - model = AutoModelForCausalLM.from_pretrained( - config["model_name"], - torch_dtype="auto", - device_map="auto" if use_gpu else None, - ) - tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) - interpreter_modules[config["model_name"]] = { - "args": { - "model": model, - "tokenizer": tokenizer, - }, - } - elif config["model_name"] == "meta-llama/Llama-3.1-8B-Instruct": - login(token=config["HF_TOKEN"]) - pipe = pipeline( - "text-generation", - model=config["model_name"], - torch_dtype=torch.bfloat16, - device_map="auto" if use_gpu else None, - ) - interpreter_modules[config["model_name"]] = { - "args": { - "pipe": pipe, - }, - } - else: - raise ValueError(f"Unsupported model_name: {config['model_name']}") - return interpreter_modules From 67f0beda147ab72c914d113f3aed529e22026733 Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Fri, 29 Aug 2025 14:55:52 -0400 Subject: [PATCH 11/14] add chunking function in scripts that is able to chunk audios into segemnts first --- scripts/chunk_func/chunk.py | 145 ++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 scripts/chunk_func/chunk.py diff --git a/scripts/chunk_func/chunk.py b/scripts/chunk_func/chunk.py new file mode 100644 index 0000000..37aea35 --- /dev/null +++ b/scripts/chunk_func/chunk.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +# Copyright 2025 BoHao Su +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import os +import argparse +from pathlib import Path + +import numpy as np +import soundfile as sf +from tqdm import tqdm + +from versa.scorer_shared import ( + audio_loader_setup, + load_audio, + wav_normalize, +) + + +def get_parser() -> argparse.Namespace: + """Get argument parser.""" + parser = argparse.ArgumentParser(description="Chunk audios into fixed durations.") + parser.add_argument( + "--pred", + type=str, + required=True, + help="Wav.scp for generated waveforms, or a dir depending on --io.", + ) + parser.add_argument( + "--io", + type=str, + default="kaldi", + choices=["kaldi", "soundfile", "dir"], + help="IO interface to use.", + ) + parser.add_argument( + "--chunk_duration", + type=float, + default=3.0, + help="Duration (sec) of each chunk window.", + ) + parser.add_argument( + "--hop_duration", + type=float, + default=None, + help="Hop size (sec) between chunk starts. 
" + "If None, equals --chunk_duration (non-overlap).", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Directory to write chunked wav files.", + ) + parser.add_argument( + "--min_last_chunk", + type=float, + default=0.0, + help="Minimum duration (sec) required to keep the final (short) chunk. " + "Set >0 to drop very short tails.", + ) + return parser + +def main(): + args = get_parser().parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if args.chunk_duration <= 0: + raise ValueError("--chunk_duration must be > 0") + + hop_duration = args.hop_duration if args.hop_duration is not None else args.chunk_duration + if hop_duration <= 0: + raise ValueError("--hop_duration must be > 0") + + if args.min_last_chunk < 0: + raise ValueError("--min_last_chunk must be >= 0") + + gen_files = audio_loader_setup(args.pred, args.io) + if len(gen_files) == 0: + raise FileNotFoundError("Not found any generated audio files from --pred with --io.") + + total_chunks = 0 + for key in tqdm(list(gen_files.keys()), desc="Chunking"): + src_path = gen_files[key] + try: + sr, wav = load_audio(src_path, args.io) + wav = wav_normalize(wav) + if wav.ndim > 1: + # Convert to mono if multichannel + wav = np.mean(wav, axis=-1) + except Exception as e: + print(f"[WARN] Failed to load {key} from {src_path}: {e}") + continue + + chunk_len = int(round(args.chunk_duration * sr)) + hop_len = int(round(hop_duration * sr)) + min_last_len = int(round(args.min_last_chunk * sr)) + + if chunk_len <= 0 or hop_len <= 0: + print(f"[WARN] Non-positive chunk/hop for key={key}; skipping.") + continue + + n_samples = len(wav) + if n_samples == 0: + print(f"[WARN] Empty audio for key={key}; skipping.") + continue + + # Iterate chunk start positions + chunk_idx = 0 + start = 0 + while start < n_samples: + end = start + chunk_len + if end > n_samples: + # last (short) chunk + if (n_samples - start) < min_last_len: + break # drop the tail if too short + end = n_samples + + chunk = wav[start:end] + if len(chunk) == 0: + break + + # Include time range in filename for traceability + t0 = start / sr + t1 = end / sr + out_name = f"{key}_chunk{chunk_idx:04d}_{t0:.3f}-{t1:.3f}.wav" + out_path = output_dir / out_name + + try: + sf.write(str(out_path), chunk, sr, subtype="PCM_16") + total_chunks += 1 + except Exception as e: + print(f"[WARN] Failed to write {out_path}: {e}") + + chunk_idx += 1 + start += hop_len + + print(f"Done. 
Wrote {total_chunks} chunks to: {output_dir.resolve()}") + + +if __name__ == "__main__": + main() From 08151054531d4b15605e366e78c52597eb1767ba Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 2 Sep 2025 17:26:59 -0400 Subject: [PATCH 12/14] create a scorer for chunk_extraction --- versa/bin/scorer_chunk.py | 402 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 versa/bin/scorer_chunk.py diff --git a/versa/bin/scorer_chunk.py b/versa/bin/scorer_chunk.py new file mode 100644 index 0000000..839a1cd --- /dev/null +++ b/versa/bin/scorer_chunk.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 + +# Copyright 2025 BoHao Su +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Scorer Interface for Speech Evaluation with optional CHUNKED scoring.""" + +import argparse +import logging +import os +from pathlib import Path +import re + +import numpy as np +import soundfile as sf +import torch +import yaml + +from versa.scorer_shared import ( + audio_loader_setup, + corpus_scoring, + list_scoring, + load_corpus_modules, + load_score_modules, + load_summary, + load_audio, + wav_normalize, +) + + +def get_parser() -> argparse.Namespace: + """Get argument parser.""" + parser = argparse.ArgumentParser(description="Speech Evaluation Interface") + parser.add_argument( + "--pred", + type=str, + help="Wav.scp for generated waveforms.", + ) + parser.add_argument( + "--score_config", type=str, default=None, help="Configuration of Score Config" + ) + parser.add_argument( + "--gt", + type=str, + default=None, + help="Wav.scp for ground truth waveforms.", + ) + parser.add_argument( + "--text", type=str, default=None, help="Path of ground truth transcription." + ) + parser.add_argument( + "--output_file", + type=str, + default=None, + help="Path of directory to write the results.", + ) + parser.add_argument( + "--cache_folder", type=str, default=None, help="Path of cache saving" + ) + parser.add_argument( + "--use_gpu", type=bool, default=False, help="whether to use GPU if it can" + ) + parser.add_argument( + "--io", + type=str, + default="kaldi", + choices=["kaldi", "soundfile", "dir"], + help="io interface to use", + ) + parser.add_argument( + "--verbose", + default=1, + type=int, + help="Verbosity level. Higher is more logging.", + ) + parser.add_argument( + "--rank", + default=0, + type=int, + help="the overall rank in the batch processing, used to specify GPU rank", + ) + parser.add_argument( + "--no_match", + action="store_true", + help="Do not match the groundtruth and generated files.", + ) + + # ---------- NEW: chunking options ---------- + parser.add_argument( + "--enable_chunking", + action="store_true", + help="If set, score on fixed-length chunks instead of full utterances.", + ) + parser.add_argument( + "--chunk_duration", + type=float, + default=0.5, + help="Chunk window length in seconds.", + ) + parser.add_argument( + "--hop_duration", + type=float, + default=0.2, + help="Hop size in seconds. If not set, equals --chunk_duration (no overlap).", + ) + parser.add_argument( + "--min_last_chunk", + type=float, + default=0.0, + help="Keep final short tail only if >= this many seconds. 0 to keep any tail.", + ) + parser.add_argument( + "--chunk_tmp_dir", + type=str, + default=None, + help="Directory to write temporary chunk wavs. 
" + "Defaults to .chunks or ./chunks when not provided.", + ) + # ------------------------------------------- + + return parser + +def _write_wav(path: Path, wav: np.ndarray, sr: int): + """Write mono PCM16 WAV safely.""" + path.parent.mkdir(parents=True, exist_ok=True) + if wav.ndim > 1: + wav = np.mean(wav, axis=-1) + sf.write(str(path), wav, sr, subtype="PCM_16") + + +def _chunk_bounds(n_samples: int, sr: int, chunk_sec: float, hop_sec: float, min_last_sec: float): + """Yield (start, end) sample indices for chunks covering [0, n_samples].""" + chunk_len = int(round(chunk_sec * sr)) + hop_len = int(round(hop_sec * sr)) + min_last = int(round(min_last_sec * sr)) + if chunk_len <= 0 or hop_len <= 0: + raise ValueError("chunk/hop must be > 0") + start = 0 + while start < n_samples: + end = start + chunk_len + if end > n_samples: + if n_samples - start < min_last: + break + end = n_samples + yield start, end + start += hop_len + + +def _chunk_pair_to_tmp( + key: str, + gen_path: str, + gt_path: str | None, + io: str, + chunk_sec: float, + hop_sec: float, + min_last_sec: float, + tmp_root: Path, +) -> tuple[dict, dict | None]: + """ + Chunk a generated file (and optionally its GT pair) into aligned windows. + - If GT is provided, both are truncated to the MIN of their lengths, then chunked + on the same boundaries for fair, aligned scoring. + Returns: + gen_chunks: {new_key -> wavpath} + gt_chunks: {new_key -> wavpath} or None + """ + # Load gen + gen_sr, gen_wav = load_audio(gen_path, io) + gen_wav = wav_normalize(gen_wav) + if gen_wav.ndim > 1: + gen_wav = np.mean(gen_wav, axis=-1) + n_gen = len(gen_wav) + + # Load gt (optional) + if gt_path is not None: + gt_sr, gt_wav = load_audio(gt_path, io) + gt_wav = wav_normalize(gt_wav) + if gt_wav.ndim > 1: + gt_wav = np.mean(gt_wav, axis=-1) + # Resample check (assume same SR; if not, we must resample – here we assert) + if gt_sr != gen_sr: + raise ValueError(f"SR mismatch for key={key}: gen {gen_sr} vs gt {gt_sr}") + n_gt = len(gt_wav) + n_use = min(n_gen, n_gt) + gen_wav = gen_wav[:n_use] + gt_wav = gt_wav[:n_use] + else: + gt_wav = None + n_use = n_gen + + gen_out = {} + gt_out = {} if gt_wav is not None else None + + for idx, (s, e) in enumerate(_chunk_bounds(n_use, gen_sr, chunk_sec, hop_sec, min_last_sec)): + t0 = s / gen_sr + t1 = e / gen_sr + new_key = f"{key}@{t0:.3f}-{t1:.3f}" + stem = f"{key}_chunk{idx:04d}_{t0:.3f}-{t1:.3f}" + + gen_path_out = tmp_root / "pred" / f"{stem}.wav" + _write_wav(gen_path_out, gen_wav[s:e], gen_sr) + gen_out[new_key] = str(gen_path_out) + + if gt_wav is not None: + gt_path_out = tmp_root / "gt" / f"{stem}.wav" + _write_wav(gt_path_out, gt_wav[s:e], gen_sr) + gt_out[new_key] = str(gt_path_out) + + return gen_out, gt_out + + +def _maybe_chunk_filelists( + args, + gen_files: dict, + gt_files: dict | None, + text_info: dict | None, +) -> tuple[dict, dict | None, dict | None, Path | None]: + """ + If chunking is enabled, create on-disk chunked wavs and return updated mappings. + Also replicates text_info per chunk key. 
+ """ + if not args.enable_chunking: + return gen_files, gt_files, text_info, None + + chunk_sec = float(args.chunk_duration) + hop_sec = float(args.hop_duration) if args.hop_duration is not None else chunk_sec + min_last_sec = float(args.min_last_chunk) + + # Choose temp root for chunks + if args.chunk_tmp_dir: + tmp_root = Path(args.chunk_tmp_dir) + elif args.output_file: + tmp_root = Path(str(args.output_file) + ".chunks") + else: + tmp_root = Path("./chunks") + tmp_root.mkdir(parents=True, exist_ok=True) + + logging.info( + f"Chunking enabled: chunk={chunk_sec}s, hop={hop_sec}s, min_last={min_last_sec}s, dir={tmp_root}" + ) + + gen_chunks_all: dict = {} + gt_chunks_all: dict | None = {} if gt_files is not None else None + text_chunks_all: dict | None = {} if text_info is not None else None + + for key, pred_path in gen_files.items(): + gt_path = gt_files.get(key) if gt_files is not None else None + try: + g_map, r_map = _chunk_pair_to_tmp( + key, + pred_path, + gt_path, + args.io, + chunk_sec, + hop_sec, + min_last_sec, + tmp_root, + ) + except Exception as e: + logging.warning(f"Chunking failed for key={key}: {e}") + continue + + # Merge into global dicts + gen_chunks_all.update(g_map) + if gt_chunks_all is not None and r_map is not None: + gt_chunks_all.update(r_map) + elif gt_chunks_all is not None and r_map is None: + # keep structure consistent + gt_chunks_all = None + + # Duplicate text per chunk if provided + if text_chunks_all is not None and text_info is not None and key in text_info: + for ck in g_map.keys(): + text_chunks_all[ck] = text_info[key] + + return gen_chunks_all, gt_chunks_all, text_chunks_all, tmp_root + + +def main(): + args = get_parser().parse_args() + + # In case of using `local` backend, all GPU will be visible to all process. + if args.use_gpu: + gpu_rank = args.rank % torch.cuda.device_count() + torch.cuda.set_device(gpu_rank) + logging.info(f"using device: cuda:{gpu_rank}") + + # logging info + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.warning("Skip DEBUG/INFO messages") + + gen_files = audio_loader_setup(args.pred, args.io) + + # find reference file + args.gt = None if args.gt == "None" else args.gt + if args.gt is not None and not args.no_match: + gt_files = audio_loader_setup(args.gt, args.io) + else: + gt_files = None + + # find ground truth transcription + if args.text is not None: + text_info = {} + with open(args.text) as f: + for line in f.readlines(): + key, value = line.strip().split(maxsplit=1) + text_info[key] = value + else: + text_info = None + + # Get and divide list + if len(gen_files) == 0: + raise FileNotFoundError("Not found any generated audio files.") + if gt_files is not None and len(gen_files) > len(gt_files) and not args.enable_chunking: + # (For chunking, we later truncate to min length per pair, so we don't pre-check count equality.) + raise ValueError( + "#groundtruth files are less than #generated files " + f"(#gen={len(gen_files)} vs. #gt={len(gt_files)}). " + "Please check the groundtruth directory." 
+ ) + + logging.info("The number of utterances (pre-chunk) = %d", len(gen_files)) + + # Optional: build chunked filelists and override maps + gen_files, gt_files, text_info, chunk_tmp_dir = _maybe_chunk_filelists( + args, gen_files, gt_files, text_info + ) + + if args.enable_chunking: + logging.info("The number of items (post-chunk) = %d", len(gen_files)) + + with open(args.score_config, "r", encoding="utf-8") as f: + score_config = yaml.full_load(f) + + score_modules = load_score_modules( + score_config, + use_gt=(True if gt_files is not None else False), + use_gt_text=(True if text_info is not None else False), + use_gpu=args.use_gpu, + ) + + if len(score_modules) > 0: + score_info = list_scoring( + gen_files, + score_modules, + gt_files, + text_info, + output_file=args.output_file, + io=args.io, + ) + logging.info("Summary: %s", load_summary(score_info)) + else: + logging.info("No utterance-level scoring function is provided.") + + corpus_score_modules = load_corpus_modules( + score_config, + use_gpu=args.use_gpu, + cache_folder=args.cache_folder, + io=args.io, + ) + assert ( + len(corpus_score_modules) > 0 or len(score_modules) > 0 + ), "no scoring function is provided" + + # NOTE: For corpus scoring we keep original (non-chunked) paths unless you explicitly want + # to aggregate over chunks. If you want corpus over chunks, pass args.pred as the CHUNK TMP dir + # and ensure your corpus scorer supports directory inputs. + if len(corpus_score_modules) > 0: + pred_for_corpus = args.pred + if args.enable_chunking and chunk_tmp_dir is not None: + # Optionally switch corpus to chunk directory: + pred_for_corpus = str(chunk_tmp_dir / "pred") + logging.info(f"Corpus scoring over chunk directory: {pred_for_corpus}") + + corpus_score_info = corpus_scoring( + pred_for_corpus, + corpus_score_modules, + args.gt if (args.gt is not None and not args.enable_chunking) else None, + text_info if (text_info is not None and args.enable_chunking) else None, + output_file=(args.output_file + ".corpus") if args.output_file else None, + ) + logging.info("Corpus Summary: %s", corpus_score_info) + else: + logging.info("No corpus-level scoring function is provided.") + + +if __name__ == "__main__": + main() From 933cd08a5e12b1a5c0c09855abaf9bae8ce2e7ea Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 9 Sep 2025 12:35:34 -0400 Subject: [PATCH 13/14] run isort and black --- versa/bin/scorer_chunk.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/versa/bin/scorer_chunk.py b/versa/bin/scorer_chunk.py index 839a1cd..0b3785d 100644 --- a/versa/bin/scorer_chunk.py +++ b/versa/bin/scorer_chunk.py @@ -8,22 +8,21 @@ import argparse import logging import os -from pathlib import Path import re +from pathlib import Path import numpy as np import soundfile as sf import torch import yaml - from versa.scorer_shared import ( audio_loader_setup, corpus_scoring, list_scoring, + load_audio, load_corpus_modules, load_score_modules, load_summary, - load_audio, wav_normalize, ) @@ -114,12 +113,13 @@ def get_parser() -> argparse.Namespace: type=str, default=None, help="Directory to write temporary chunk wavs. 
" - "Defaults to .chunks or ./chunks when not provided.", + "Defaults to .chunks or ./chunks when not provided.", ) # ------------------------------------------- return parser + def _write_wav(path: Path, wav: np.ndarray, sr: int): """Write mono PCM16 WAV safely.""" path.parent.mkdir(parents=True, exist_ok=True) @@ -128,7 +128,9 @@ def _write_wav(path: Path, wav: np.ndarray, sr: int): sf.write(str(path), wav, sr, subtype="PCM_16") -def _chunk_bounds(n_samples: int, sr: int, chunk_sec: float, hop_sec: float, min_last_sec: float): +def _chunk_bounds( + n_samples: int, sr: int, chunk_sec: float, hop_sec: float, min_last_sec: float +): """Yield (start, end) sample indices for chunks covering [0, n_samples].""" chunk_len = int(round(chunk_sec * sr)) hop_len = int(round(hop_sec * sr)) @@ -191,7 +193,9 @@ def _chunk_pair_to_tmp( gen_out = {} gt_out = {} if gt_wav is not None else None - for idx, (s, e) in enumerate(_chunk_bounds(n_use, gen_sr, chunk_sec, hop_sec, min_last_sec)): + for idx, (s, e) in enumerate( + _chunk_bounds(n_use, gen_sr, chunk_sec, hop_sec, min_last_sec) + ): t0 = s / gen_sr t1 = e / gen_sr new_key = f"{key}@{t0:.3f}-{t1:.3f}" @@ -325,7 +329,11 @@ def main(): # Get and divide list if len(gen_files) == 0: raise FileNotFoundError("Not found any generated audio files.") - if gt_files is not None and len(gen_files) > len(gt_files) and not args.enable_chunking: + if ( + gt_files is not None + and len(gen_files) > len(gt_files) + and not args.enable_chunking + ): # (For chunking, we later truncate to min length per pair, so we don't pre-check count equality.) raise ValueError( "#groundtruth files are less than #generated files " From 6a734f32bd416645d5bc8613493b9c8131728ea2 Mon Sep 17 00:00:00 2001 From: BohaoSu Date: Tue, 9 Sep 2025 12:38:07 -0400 Subject: [PATCH 14/14] run isort and blakc on chunk.py --- scripts/chunk_func/chunk.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/chunk_func/chunk.py b/scripts/chunk_func/chunk.py index 37aea35..89540d9 100644 --- a/scripts/chunk_func/chunk.py +++ b/scripts/chunk_func/chunk.py @@ -3,19 +3,14 @@ # Copyright 2025 BoHao Su # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -import os import argparse +import os from pathlib import Path import numpy as np import soundfile as sf from tqdm import tqdm - -from versa.scorer_shared import ( - audio_loader_setup, - load_audio, - wav_normalize, -) +from versa.scorer_shared import audio_loader_setup, load_audio, wav_normalize def get_parser() -> argparse.Namespace: @@ -45,7 +40,7 @@ def get_parser() -> argparse.Namespace: type=float, default=None, help="Hop size (sec) between chunk starts. " - "If None, equals --chunk_duration (non-overlap).", + "If None, equals --chunk_duration (non-overlap).", ) parser.add_argument( "--output_dir", @@ -58,10 +53,11 @@ def get_parser() -> argparse.Namespace: type=float, default=0.0, help="Minimum duration (sec) required to keep the final (short) chunk. 
" - "Set >0 to drop very short tails.", + "Set >0 to drop very short tails.", ) return parser + def main(): args = get_parser().parse_args() @@ -71,7 +67,9 @@ def main(): if args.chunk_duration <= 0: raise ValueError("--chunk_duration must be > 0") - hop_duration = args.hop_duration if args.hop_duration is not None else args.chunk_duration + hop_duration = ( + args.hop_duration if args.hop_duration is not None else args.chunk_duration + ) if hop_duration <= 0: raise ValueError("--hop_duration must be > 0") @@ -80,7 +78,9 @@ def main(): gen_files = audio_loader_setup(args.pred, args.io) if len(gen_files) == 0: - raise FileNotFoundError("Not found any generated audio files from --pred with --io.") + raise FileNotFoundError( + "Not found any generated audio files from --pred with --io." + ) total_chunks = 0 for key in tqdm(list(gen_files.keys()), desc="Chunking"):