Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions egs/interpreter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# interpreter example yaml config
# A list of interpreter backends your code will load.
# Each item must have at least `model_name`.
# For some models (Mistral, Llama 3.1), you must also provide HF_TOKEN.

interpreter_config:
# Easiest path: no HF login required in your loader
- model_name: "Qwen/Qwen2.5-7B-Instruct"
145 changes: 145 additions & 0 deletions scripts/chunk_func/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import os
from pathlib import Path

import numpy as np
import soundfile as sf
from tqdm import tqdm
from versa.scorer_shared import audio_loader_setup, load_audio, wav_normalize


def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the audio-chunking CLI.

    Returns:
        argparse.ArgumentParser: parser for the chunking options below.
        (The original annotation said ``argparse.Namespace``, but this
        function returns the parser itself; callers invoke ``parse_args()``.)
    """
    parser = argparse.ArgumentParser(description="Chunk audios into fixed durations.")
    parser.add_argument(
        "--pred",
        type=str,
        required=True,
        help="Wav.scp for generated waveforms, or a dir depending on --io.",
    )
    parser.add_argument(
        "--io",
        type=str,
        default="kaldi",
        choices=["kaldi", "soundfile", "dir"],
        help="IO interface to use.",
    )
    parser.add_argument(
        "--chunk_duration",
        type=float,
        default=3.0,
        help="Duration (sec) of each chunk window.",
    )
    parser.add_argument(
        "--hop_duration",
        type=float,
        default=None,
        help="Hop size (sec) between chunk starts. "
        "If None, equals --chunk_duration (non-overlap).",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory to write chunked wav files.",
    )
    parser.add_argument(
        "--min_last_chunk",
        type=float,
        default=0.0,
        help="Minimum duration (sec) required to keep the final (short) chunk. "
        "Set >0 to drop very short tails.",
    )
    return parser


def main():
    """Split every input waveform into fixed-length windows and write them out."""
    args = get_parser().parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Validate durations up front so bad flags fail before any audio is read.
    if args.chunk_duration <= 0:
        raise ValueError("--chunk_duration must be > 0")

    hop_duration = args.hop_duration
    if hop_duration is None:
        # Default hop equals the window length (non-overlapping chunks).
        hop_duration = args.chunk_duration
    if hop_duration <= 0:
        raise ValueError("--hop_duration must be > 0")

    if args.min_last_chunk < 0:
        raise ValueError("--min_last_chunk must be >= 0")

    gen_files = audio_loader_setup(args.pred, args.io)
    if not gen_files:
        raise FileNotFoundError(
            "Not found any generated audio files from --pred with --io."
        )

    written = 0
    for utt_id in tqdm(list(gen_files), desc="Chunking"):
        src_path = gen_files[utt_id]
        try:
            sr, samples = load_audio(src_path, args.io)
            samples = wav_normalize(samples)
            if samples.ndim > 1:
                # Downmix multichannel audio to mono.
                samples = np.mean(samples, axis=-1)
        except Exception as e:
            print(f"[WARN] Failed to load {utt_id} from {src_path}: {e}")
            continue

        # Convert second-based options to sample counts for this file's rate.
        win_len = int(round(args.chunk_duration * sr))
        hop_len = int(round(hop_duration * sr))
        tail_min = int(round(args.min_last_chunk * sr))

        if win_len <= 0 or hop_len <= 0:
            print(f"[WARN] Non-positive chunk/hop for key={utt_id}; skipping.")
            continue

        total = len(samples)
        if total == 0:
            print(f"[WARN] Empty audio for key={utt_id}; skipping.")
            continue

        # Walk the chunk start positions at the configured hop.
        for idx, begin in enumerate(range(0, total, hop_len)):
            stop = begin + win_len
            if stop > total:
                # Final (short) window: drop the tail if it is too short.
                if (total - begin) < tail_min:
                    break
                stop = total

            piece = samples[begin:stop]
            if len(piece) == 0:
                break  # defensive; begin < total guarantees >= 1 sample

            # Embed the time range in the filename for traceability.
            start_sec = begin / sr
            stop_sec = stop / sr
            out_path = out_dir / (
                f"{utt_id}_chunk{idx:04d}_{start_sec:.3f}-{stop_sec:.3f}.wav"
            )

            try:
                sf.write(str(out_path), piece, sr, subtype="PCM_16")
                written += 1
            except Exception as e:
                print(f"[WARN] Failed to write {out_path}: {e}")

    print(f"Done. Wrote {written} chunks to: {out_dir.resolve()}")


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions scripts/description/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Speech Evaluation Interpreter

This tool loads utterance-level metrics and uses LLM interpreters to generate natural-language descriptions.


## Files
- `interpreter.py`: CLI entry point, loads config, metrics, runs interpreters, saves JSON.
- `interpreter_shared.py`: utilities for loading metrics and models.
- `text_llm_description.py`: **you implement** `describe_all(...)` to describe each utterance.


## Example Input

### scores.scp

```
utt_0001 {"SNR": 23.1, "WER": 0.08, "MOS": 4.2}
utt_0002 {"SNR": 12.7, "WER": 0.30, "MOS": 3.0}
```

### egs/interpreter.yaml
```yaml
interpreter_config:
- model_name: "Qwen/Qwen2.5-7B-Instruct"
```

## Run

```bash
python interpreter.py \
--config egs/interpreter.yaml \
--score_output_file scores.scp \
--output_file descriptions.json \
--use_gpu False \
--verbose 1
```
108 changes: 108 additions & 0 deletions scripts/description/interpreter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""Interpreter Interface for Speech Evaluation."""

import argparse
import json
import logging

import torch
import yaml
from text_llm_description import describe_all
from interpreter_shared import load_interpreter_modules, metric_loader_setup


def _str2bool(value) -> bool:
    """Parse a command-line boolean such as 'True'/'false'/'yes'/'0'.

    The original flag used ``type=bool``, under which every non-empty string
    (including the literal "False") is truthy, so ``--use_gpu False`` silently
    enabled the GPU path. This converter makes textual booleans work.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the interpretation CLI.

    Returns:
        argparse.ArgumentParser: parser for the options below. (The original
        annotation said ``argparse.Namespace``; the parser itself is returned.)
    """
    parser = argparse.ArgumentParser(
        description="Interpretation for Speech Evaluation Interface"
    )
    parser.add_argument(
        "--score_output_file",
        type=str,
        default=None,
        help="Path of directory of the score results.",
    )
    parser.add_argument(
        "--config",
        required=True,
        help="YAML with interpreter_config (list of model_name dicts)",
    )
    parser.add_argument(
        "--output_file", required=True, help="Where to dump the JSON descriptions"
    )
    parser.add_argument(
        "--use_gpu",
        type=_str2bool,
        default=False,
        help="whether to use GPU if it can",
    )
    parser.add_argument(
        "--verbose",
        default=1,
        type=int,
        help="Verbosity level. Higher is more logging.",
    )
    parser.add_argument(
        "--rank",
        default=0,
        type=int,
        help="the overall rank in the batch processing, used to specify GPU rank",
    )
    return parser


def main():
    """CLI entry point: load metrics, run each configured interpreter, dump JSON."""
    args = get_parser().parse_args()

    # Configure logging FIRST. In the original order, logging.info() was
    # called (in the GPU branch) before basicConfig(); the stdlib helper then
    # implicitly installs a default WARNING handler, and every subsequent
    # basicConfig() call is a no-op — so --verbose never took effect when
    # --use_gpu was set, and the device message itself was dropped.
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # In case of using `local` backend, all GPUs are visible to every process,
    # so select one deterministically from the batch rank.
    if args.use_gpu:
        gpu_rank = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(gpu_rank)
        logging.info(f"using device: cuda:{gpu_rank}")

    # 1) Load utterance-level metrics (utt_id -> metrics dict).
    metrics = metric_loader_setup(args.score_output_file)
    logging.info("The number of utterances = %d" % len(metrics))

    # 2) Load interpreter modules from YAML
    with open(args.config) as cf:
        cfg = yaml.safe_load(cf)
    interpreter_modules = load_interpreter_modules(
        cfg["interpreter_config"],
        use_gpu=args.use_gpu,
    )

    # 3) Run description for each model
    all_results = []
    for model_cfg in cfg["interpreter_config"]:
        name = model_cfg["model_name"]
        logging.info(f"Describing with {name}")
        res = describe_all(metrics, name, interpreter_modules)
        all_results.extend(res)

    # 4) Dump all descriptions as one JSON list.
    with open(args.output_file, "w") as outf:
        json.dump(all_results, outf, ensure_ascii=False, indent=2)
    logging.info(f"Wrote descriptions to {args.output_file}")


if __name__ == "__main__":
    main()
77 changes: 77 additions & 0 deletions scripts/description/interpreter_shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import json

import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def metric_loader_setup(score_output_file):
    """Load per-utterance metrics from an scp-style score file.

    Each non-empty line is expected to have the form::

        utt_id <JSON-metrics>

    Args:
        score_output_file: path to the scp-style score file.

    Returns:
        dict mapping utt_id to its parsed metrics dict.
    """
    metrics = {}
    with open(score_output_file, "r") as scp:
        for raw_line in scp:
            stripped = raw_line.strip()
            if not stripped:
                continue  # skip blank lines
            utt_id, payload = stripped.split(maxsplit=1)
            metrics[utt_id] = json.loads(payload)
    return metrics


def load_interpreter_modules(interpreter_config, use_gpu):
    """Instantiate one LLM backend per entry of ``interpreter_config``.

    The original implementation duplicated the Qwen and Mistral branches
    byte-for-byte except for the Hugging Face login; this version drives the
    same behavior from model-name sets instead.

    Args:
        interpreter_config: list of dicts, each with at least ``model_name``.
            Gated models (Mistral, Llama 3.1) must also carry ``HF_TOKEN``
            (a missing token raises KeyError, as before).
        use_gpu: when True, pass ``device_map="auto"`` so weights are
            placed on available accelerators.

    Returns:
        dict mapping model_name to {"args": {...}} with either a
        model/tokenizer pair or a text-generation pipeline.

    Raises:
        ValueError: for any model_name outside the supported set.
    """
    assert interpreter_config, "no interpreter function is provided"

    # Gated repos that require an authenticated Hugging Face session.
    gated_models = {
        "mistralai/Mistral-7B-Instruct-v0.3",
        "meta-llama/Llama-3.1-8B-Instruct",
    }
    # Loaded via AutoModelForCausalLM + AutoTokenizer.
    causal_lm_models = {
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3",
    }
    # Loaded via the text-generation pipeline.
    pipeline_models = {"meta-llama/Llama-3.1-8B-Instruct"}

    interpreter_modules = {}
    for config in interpreter_config:
        print(config, flush=True)
        name = config["model_name"]

        if name in gated_models:
            # KeyError on a missing HF_TOKEN matches the original behavior.
            login(token=config["HF_TOKEN"])

        if name in causal_lm_models:
            model = AutoModelForCausalLM.from_pretrained(
                name,
                torch_dtype="auto",
                device_map="auto" if use_gpu else None,
            )
            tokenizer = AutoTokenizer.from_pretrained(name)
            interpreter_modules[name] = {
                "args": {
                    "model": model,
                    "tokenizer": tokenizer,
                },
            }
        elif name in pipeline_models:
            pipe = pipeline(
                "text-generation",
                model=name,
                torch_dtype=torch.bfloat16,
                device_map="auto" if use_gpu else None,
            )
            interpreter_modules[name] = {
                "args": {
                    "pipe": pipe,
                },
            }
        else:
            raise ValueError(f"Unsupported model_name: {name}")
    return interpreter_modules
Loading