Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions egs/interpreter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# interpreter example yaml config
# A list of interpreter backends your code will load.
# Each item must have at least `model_name`.
# For some models (Mistral, Llama 3.1), you must also provide HF_TOKEN.

interpreter_config:
# Easiest path: no HF login required in your loader
- model_name: "Qwen/Qwen2.5-7B-Instruct"
145 changes: 145 additions & 0 deletions scripts/chunk_func/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import os
from pathlib import Path

import numpy as np
import soundfile as sf
from tqdm import tqdm
from versa.scorer_shared import audio_loader_setup, load_audio, wav_normalize


def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the audio-chunking CLI.

    Returns:
        argparse.ArgumentParser: parser for the chunking options below.
        (The original annotation said ``argparse.Namespace``, but this
        function returns the parser itself; callers invoke ``parse_args()``.)
    """
    parser = argparse.ArgumentParser(description="Chunk audios into fixed durations.")
    parser.add_argument(
        "--pred",
        type=str,
        required=True,
        help="Wav.scp for generated waveforms, or a dir depending on --io.",
    )
    parser.add_argument(
        "--io",
        type=str,
        default="kaldi",
        choices=["kaldi", "soundfile", "dir"],
        help="IO interface to use.",
    )
    parser.add_argument(
        "--chunk_duration",
        type=float,
        default=3.0,
        help="Duration (sec) of each chunk window.",
    )
    parser.add_argument(
        "--hop_duration",
        type=float,
        default=None,
        help="Hop size (sec) between chunk starts. "
        "If None, equals --chunk_duration (non-overlap).",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory to write chunked wav files.",
    )
    parser.add_argument(
        "--min_last_chunk",
        type=float,
        default=0.0,
        help="Minimum duration (sec) required to keep the final (short) chunk. "
        "Set >0 to drop very short tails.",
    )
    return parser


def main():
    """Split every input waveform into fixed-length windows and write them out."""
    args = get_parser().parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Validate durations up front so bad flags fail before any audio is read.
    if args.chunk_duration <= 0:
        raise ValueError("--chunk_duration must be > 0")

    hop_duration = args.hop_duration
    if hop_duration is None:
        # Default hop equals the window length (non-overlapping chunks).
        hop_duration = args.chunk_duration
    if hop_duration <= 0:
        raise ValueError("--hop_duration must be > 0")

    if args.min_last_chunk < 0:
        raise ValueError("--min_last_chunk must be >= 0")

    gen_files = audio_loader_setup(args.pred, args.io)
    if not gen_files:
        raise FileNotFoundError(
            "Not found any generated audio files from --pred with --io."
        )

    written = 0
    for utt_id in tqdm(list(gen_files), desc="Chunking"):
        src_path = gen_files[utt_id]
        try:
            sr, samples = load_audio(src_path, args.io)
            samples = wav_normalize(samples)
            if samples.ndim > 1:
                # Downmix multichannel audio to mono.
                samples = np.mean(samples, axis=-1)
        except Exception as e:
            print(f"[WARN] Failed to load {utt_id} from {src_path}: {e}")
            continue

        # Convert second-based options to sample counts for this file's rate.
        win_len = int(round(args.chunk_duration * sr))
        hop_len = int(round(hop_duration * sr))
        tail_min = int(round(args.min_last_chunk * sr))

        if win_len <= 0 or hop_len <= 0:
            print(f"[WARN] Non-positive chunk/hop for key={utt_id}; skipping.")
            continue

        total = len(samples)
        if total == 0:
            print(f"[WARN] Empty audio for key={utt_id}; skipping.")
            continue

        # Walk the chunk start positions at the configured hop.
        for idx, begin in enumerate(range(0, total, hop_len)):
            stop = begin + win_len
            if stop > total:
                # Final (short) window: drop the tail if it is too short.
                if (total - begin) < tail_min:
                    break
                stop = total

            piece = samples[begin:stop]
            if len(piece) == 0:
                break  # defensive; begin < total guarantees >= 1 sample

            # Embed the time range in the filename for traceability.
            start_sec = begin / sr
            stop_sec = stop / sr
            out_path = out_dir / (
                f"{utt_id}_chunk{idx:04d}_{start_sec:.3f}-{stop_sec:.3f}.wav"
            )

            try:
                sf.write(str(out_path), piece, sr, subtype="PCM_16")
                written += 1
            except Exception as e:
                print(f"[WARN] Failed to write {out_path}: {e}")

    print(f"Done. Wrote {written} chunks to: {out_dir.resolve()}")


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions scripts/description/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Speech Evaluation Interpreter

This tool loads utterance-level metrics and uses LLM interpreters to generate natural-language descriptions.


## Files
- `interpreter.py`: CLI entry point, loads config, metrics, runs interpreters, saves JSON.
- `interpreter_shared.py`: utilities for loading metrics and models.
- `text_llm_description.py`: **you implement** `describe_all(...)` to describe each utterance.


## Example Input

### scores.scp

```
utt_0001 {"SNR": 23.1, "WER": 0.08, "MOS": 4.2}
utt_0002 {"SNR": 12.7, "WER": 0.30, "MOS": 3.0}
```

### egs/interpreter.yaml
```yaml
interpreter_config:
- model_name: "Qwen/Qwen2.5-7B-Instruct"
```

## Run

```bash
python interpreter.py \
--config egs/interpreter.yaml \
--score_output_file scores.scp \
--output_file descriptions.json \
--use_gpu False \
--verbose 1
```
108 changes: 108 additions & 0 deletions scripts/description/interpreter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""Interpreter Interface for Speech Evaluation."""

import argparse
import json
import logging

import torch
import yaml
from text_llm_description import describe_all
from interpreter_shared import load_interpreter_modules, metric_loader_setup


def _str2bool(value) -> bool:
    """Parse a command-line boolean such as 'True'/'false'/'yes'/'0'.

    The original flag used ``type=bool``, under which every non-empty string
    (including the literal "False") is truthy, so ``--use_gpu False`` silently
    enabled the GPU path. This converter makes textual booleans work.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the interpretation CLI.

    Returns:
        argparse.ArgumentParser: parser for the options below. (The original
        annotation said ``argparse.Namespace``; the parser itself is returned.)
    """
    parser = argparse.ArgumentParser(
        description="Interpretation for Speech Evaluation Interface"
    )
    parser.add_argument(
        "--score_output_file",
        type=str,
        default=None,
        help="Path of directory of the score results.",
    )
    parser.add_argument(
        "--config",
        required=True,
        help="YAML with interpreter_config (list of model_name dicts)",
    )
    parser.add_argument(
        "--output_file", required=True, help="Where to dump the JSON descriptions"
    )
    parser.add_argument(
        "--use_gpu",
        type=_str2bool,
        default=False,
        help="whether to use GPU if it can",
    )
    parser.add_argument(
        "--verbose",
        default=1,
        type=int,
        help="Verbosity level. Higher is more logging.",
    )
    parser.add_argument(
        "--rank",
        default=0,
        type=int,
        help="the overall rank in the batch processing, used to specify GPU rank",
    )
    return parser


def main():
    """CLI entry point: load metrics, run each configured interpreter, dump JSON."""
    args = get_parser().parse_args()

    # Configure logging FIRST. In the original order, logging.info() was
    # called (in the GPU branch) before basicConfig(); the stdlib helper then
    # implicitly installs a default WARNING handler, and every subsequent
    # basicConfig() call is a no-op — so --verbose never took effect when
    # --use_gpu was set, and the device message itself was dropped.
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # In case of using `local` backend, all GPUs are visible to every process,
    # so select one deterministically from the batch rank.
    if args.use_gpu:
        gpu_rank = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(gpu_rank)
        logging.info(f"using device: cuda:{gpu_rank}")

    # 1) Load utterance-level metrics (utt_id -> metrics dict).
    metrics = metric_loader_setup(args.score_output_file)
    logging.info("The number of utterances = %d" % len(metrics))

    # 2) Load interpreter modules from YAML
    with open(args.config) as cf:
        cfg = yaml.safe_load(cf)
    interpreter_modules = load_interpreter_modules(
        cfg["interpreter_config"],
        use_gpu=args.use_gpu,
    )

    # 3) Run description for each model
    all_results = []
    for model_cfg in cfg["interpreter_config"]:
        name = model_cfg["model_name"]
        logging.info(f"Describing with {name}")
        res = describe_all(metrics, name, interpreter_modules)
        all_results.extend(res)

    # 4) Dump all descriptions as one JSON list.
    with open(args.output_file, "w") as outf:
        json.dump(all_results, outf, ensure_ascii=False, indent=2)
    logging.info(f"Wrote descriptions to {args.output_file}")


if __name__ == "__main__":
    main()
77 changes: 77 additions & 0 deletions scripts/description/interpreter_shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python3

# Copyright 2025 BoHao Su
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import json

import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def metric_loader_setup(score_output_file):
    """Load per-utterance metrics from an scp-style score file.

    Each non-empty line is expected to have the form::

        utt_id <JSON-metrics>

    Args:
        score_output_file: path to the scp-style score file.

    Returns:
        dict mapping utt_id to its parsed metrics dict.
    """
    metrics = {}
    with open(score_output_file, "r") as scp:
        for raw_line in scp:
            stripped = raw_line.strip()
            if not stripped:
                continue  # skip blank lines
            utt_id, payload = stripped.split(maxsplit=1)
            metrics[utt_id] = json.loads(payload)
    return metrics


def load_interpreter_modules(interpreter_config, use_gpu):
    """Instantiate one LLM backend per entry of ``interpreter_config``.

    The original implementation duplicated the Qwen and Mistral branches
    byte-for-byte except for the Hugging Face login; this version drives the
    same behavior from model-name sets instead.

    Args:
        interpreter_config: list of dicts, each with at least ``model_name``.
            Gated models (Mistral, Llama 3.1) must also carry ``HF_TOKEN``
            (a missing token raises KeyError, as before).
        use_gpu: when True, pass ``device_map="auto"`` so weights are
            placed on available accelerators.

    Returns:
        dict mapping model_name to {"args": {...}} with either a
        model/tokenizer pair or a text-generation pipeline.

    Raises:
        ValueError: for any model_name outside the supported set.
    """
    assert interpreter_config, "no interpreter function is provided"

    # Gated repos that require an authenticated Hugging Face session.
    gated_models = {
        "mistralai/Mistral-7B-Instruct-v0.3",
        "meta-llama/Llama-3.1-8B-Instruct",
    }
    # Loaded via AutoModelForCausalLM + AutoTokenizer.
    causal_lm_models = {
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3",
    }
    # Loaded via the text-generation pipeline.
    pipeline_models = {"meta-llama/Llama-3.1-8B-Instruct"}

    interpreter_modules = {}
    for config in interpreter_config:
        print(config, flush=True)
        name = config["model_name"]

        if name in gated_models:
            # KeyError on a missing HF_TOKEN matches the original behavior.
            login(token=config["HF_TOKEN"])

        if name in causal_lm_models:
            model = AutoModelForCausalLM.from_pretrained(
                name,
                torch_dtype="auto",
                device_map="auto" if use_gpu else None,
            )
            tokenizer = AutoTokenizer.from_pretrained(name)
            interpreter_modules[name] = {
                "args": {
                    "model": model,
                    "tokenizer": tokenizer,
                },
            }
        elif name in pipeline_models:
            pipe = pipeline(
                "text-generation",
                model=name,
                torch_dtype=torch.bfloat16,
                device_map="auto" if use_gpu else None,
            )
            interpreter_modules[name] = {
                "args": {
                    "pipe": pipe,
                },
            }
        else:
            raise ValueError(f"Unsupported model_name: {name}")
    return interpreter_modules
Loading