From e12bb6a6ff87538106839a5518fb2bfc8725c431 Mon Sep 17 00:00:00 2001
From: Stanwang1210
Date: Tue, 24 Jun 2025 17:02:33 +0800
Subject: [PATCH 1/2] init glm 4 voice

---
 versa/utterance_metrics/glm_4_voice.py | 540 +++++++++++++++++++++++++
 1 file changed, 540 insertions(+)
 create mode 100644 versa/utterance_metrics/glm_4_voice.py

diff --git a/versa/utterance_metrics/glm_4_voice.py b/versa/utterance_metrics/glm_4_voice.py
new file mode 100644
index 0000000..9361621
--- /dev/null
+++ b/versa/utterance_metrics/glm_4_voice.py
@@ -0,0 +1,540 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 Jiatong Shi
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+Speech Properties for Metadata Modeling (GLM-4-Voice-based)
+
+This module provides functions for extracting various speech properties
+from audio using GLM-4-Voice. The properties are organized into the
+following categories:
+
+1. Speaker Characteristics
+   - glm_4_voice_speaker_count_metric: Number of distinct speakers
+   - glm_4_voice_speaker_gender_metric: Gender of speaker(s)
+   - glm_4_voice_speaker_age_metric: Age group of speaker(s)
+   - glm_4_voice_speech_impairment_metric: Presence and type of speech disorders
+
+2. Voice Properties
+   - glm_4_voice_voice_pitch_metric: Overall pitch level
+   - glm_4_voice_pitch_range_metric: Variation in intonation
+   - glm_4_voice_voice_type_metric: Voice texture characteristics
+   - glm_4_voice_speech_volume_level_metric: Loudness of speech
+
+3. Speech Content
+   - glm_4_voice_language_metric: Language(s) being spoken
+   - glm_4_voice_speech_register_metric: Level of formality in speech
+   - glm_4_voice_vocabulary_complexity_metric: Sophistication of word choice
+   - glm_4_voice_speech_purpose_metric: Communicative goal of speech
+
+4. Speech Delivery
+   - glm_4_voice_speech_emotion_metric: Emotional state conveyed
+   - glm_4_voice_speech_clarity_metric: Intelligibility of speech
+   - glm_4_voice_speech_rate_metric: Speed of delivery
+   - glm_4_voice_speaking_style_metric: Overall presentation manner
+   - glm_4_voice_laughter_crying_metric: Presence of emotional vocalizations
+
+5. Interaction Patterns
+   - glm_4_voice_overlapping_speech_metric: Degree of simultaneous speech
+
+6. Recording Environment
+   - glm_4_voice_speech_background_environment_metric: Setting where recorded
+   - glm_4_voice_recording_quality_metric: Technical quality of recording
+   - glm_4_voice_channel_type_metric: Equipment used for recording
+
+7. Vocal Evaluation
+   - glm_4_voice_singing_technique_metric: Singing techniques (styles)
+
+Each function follows the same signature pattern:
+    glm_4_voice_utils: Dictionary containing model, processor, and conversation
+    pred_x: Audio signal as numpy array
+    fs: Sampling rate in Hz (default 16000)
+    custom_prompt: Optional custom prompt to override the default
+
+Each function returns a dictionary with a single key-value pair, where
+the key is the metric name prefixed with "glm_4_voice_" and the value is
+the model's response.
+"""
+import sys
+import copy
+import logging
+from typing import Dict, Optional, Any
+import os
+import librosa
+import numpy as np
+import torch
+
+try:
+    from transformers import AutoModel, AutoTokenizer, WhisperFeatureExtractor, BitsAndBytesConfig
+except ImportError:
+    logging.warning(
+        "transformers could not be imported. Please follow the installation instructions of the latest GLM-4-Voice release and retry."
+ ) + AutoModel, AutoTokenizer, WhisperFeatureExtractor, BitsAndBytesConfig = None, None, None, None + +# TO-DO: Remove this line when the GLM-4-Voice is installed in the environment +# This is a workaround to ensure the GLM-4-Voice model can be imported correctly +library_path = os.path.expanduser("~/GLM-4-Voice") +if library_path not in sys.path: + sys.path.insert(0, library_path) + +from speech_tokenizer.modeling_whisper import WhisperVQEncoder +from speech_tokenizer.utils import extract_speech_token +# from audio_process import AudioStreamProcessor +# Default prompts for different metrics +DEFAULT_PROMPTS = { + # Speaker Characteristics + "speaker_count": """Analyze the audio and determine the number of distinct speakers present. +Provide your answer as a single number between 1-10. +Examples: +- For a monologue: 1 +- For an interview with host and guest: 2 +- For a panel discussion with a moderator and three panelists: 4""", + "speaker_gender": """Identify the perceived gender of the speaker(s). +If multiple speakers, list each speaker with their perceived gender. +Choose from: +- Male +- Female +- Non-binary/unclear +- Multiple speakers with mixed genders""", + "speaker_age": """Identify the age group of the speaker. +Choose exactly one label from the following categories: +- Child: under 13 years +- Teen: 13-19 years +- Young adult: 20-35 years +- Middle-aged adult: 36-55 years +- Senior: over 55 years""", + "speech_impairment": """Assess whether there are any noticeable speech impairments or disorders in the speaker's voice. +Choose exactly one category: +- No apparent impairment: typical speech patterns +- Stuttering/disfluency: repetitions, blocks, or prolongations of sounds +- Articulation disorder: difficulty with specific speech sounds +- Voice disorder: abnormal pitch, loudness, or quality +- Fluency disorder: atypical rhythm, rate, or flow of speech +- Foreign accent: non-native pronunciation patterns +- Dysarthria: slurred or unclear speech from muscle weakness +- Apraxia: difficulty with motor planning for speech +- Other impairment: speech pattern that suggests a different disorder""", + # Voice Properties + "voice_pitch": """Analyze the voice pitch/tone of the speaker. +Choose exactly one category from the following: +- Very high: significantly higher than average for their perceived gender +- High: noticeably above average pitch +- Medium: average pitch range +- Low: noticeably below average pitch +- Very low: significantly lower than average for their perceived gender""", + "pitch_range": """Assess the pitch variation/intonation range in the speaker's voice. +Choose exactly one category: +- Wide range: highly expressive with significant variation between high and low tones +- Moderate range: normal variation in pitch during speech +- Narrow range: minimal pitch variation, relatively monotone delivery +- Monotone: almost no pitch variation""", + "voice_type": """Identify the dominant voice quality-related characteristic of the speaker. 
+Choose exactly one category: +- Clear: clean vocal production without noticeable texture issues +- Breathy: voice has audible breath sounds, less vocal cord closure +- Creaky/vocal fry: low-frequency rattling sound, especially at ends of phrases +- Hoarse: rough, raspy quality indicating vocal strain +- Nasal: voice resonates primarily through the nose +- Pressed/tense: strained quality from excessive vocal cord pressure +- Resonant: rich, vibrant voice with good projection +- Whispered: intentionally quiet with minimal vocal cord vibration +- Tremulous: shaky or quivery voice quality""", + "speech_volume_level": """Assess the overall volume or loudness level of the speaker. +Choose exactly one category: +- Very quiet: barely audible, whispering or very soft-spoken +- Quiet: below average volume, soft-spoken +- Moderate: normal conversational volume +- Loud: above average volume, projecting voice +- Very loud: shouting or extremely high volume +- Variable: significant changes in volume throughout the recording""", + # Speech Content + "language": """Identify all languages spoken in the audio. +List languages using their English names. +Choose from common languages: +- English +- Spanish +- Mandarin Chinese +- Hindi +- Arabic +- French +- Russian +- Portuguese +- German +- Japanese +- Other (specify if possible)""", + "speech_register": """Determine the speech register used by the speaker. +Choose exactly one category: +- Formal register: careful pronunciation, complex grammar, specialized vocabulary +- Standard register: proper grammar and pronunciation for professional or educational contexts +- Consultative register: mixture of formal and casual for everyday professional interactions +- Casual register: relaxed grammar, contractions, colloquialisms for friends/family +- Intimate register: highly familiar language used with close relations +- Technical register: specialized terminology for a specific field or profession +- Slang register: highly informal with group-specific vocabulary""", + "vocabulary_complexity": """Evaluate the vocabulary complexity level in the speech. +Choose exactly one category: +- Basic: simple, everyday vocabulary, mostly high-frequency words +- General: standard vocabulary for common topics, occasional advanced words +- Advanced: sophisticated vocabulary with specific terminology +- Technical: specialized/domain-specific terminology +- Academic: scholarly vocabulary with abstract concepts""", + "speech_purpose": """Identify the primary purpose of the speech. +Choose one category: +- Informative: primarily explains or educates +- Persuasive: attempts to convince or change opinions +- Entertainment: primarily aims to amuse or entertain +- Narrative: tells a story or relates events +- Conversational: casual exchange of information +- Instructional: provides specific directions or guidance +- Emotional expression: primarily conveys feelings or emotional state""", + # Speech Delivery + "speech_emotion": """Identify the dominant emotion expressed in this speech. 
+Choose exactly one label from the following categories: +- Neutral: even-toned, matter-of-fact delivery with minimal emotional expression +- Happy: upbeat, positive, enthusiastic tone +- Sad: downcast, melancholic, somber tone +- Angry: irritated, frustrated, hostile tone +- Fearful: anxious, worried, frightened tone +- Surprised: astonished, shocked tone +- Disgusted: repulsed, revolted tone +- Other: other emotion that cannot be classified by above classes""", + "speech_clarity": """Rate the overall clarity and intelligibility of the speech. +Choose one category: +- High clarity: perfectly intelligible, professional quality +- Medium clarity: generally understandable with occasional unclear segments +- Low clarity: difficult to understand, frequent unclear segments +- Very low clarity: mostly unintelligible""", + "speech_rate": """Assess the rate of speech in the audio. +Choose one category: +- Very slow: deliberate, significantly slower than average speech +- Slow: relaxed pace, slower than conversational speech +- Medium: average conversational pace +- Fast: quicker than average conversational speech +- Very fast: rapid delivery, difficult to follow""", + "speaking_style": """Identify the predominant speaking style of the speaker. +Choose exactly one category: +- Formal: structured, proper, adherence to linguistic conventions +- Professional: clear, efficient communication focused on task/topic +- Casual/conversational: relaxed, everyday speech +- Animated/enthusiastic: highly energetic, expressive speech +- Deliberate: careful, measured delivery +- Dramatic: theatrical, performance-oriented speech +- Authoritative: commanding, confident tone +- Hesitant: uncertain, tentative speech with pauses""", + "laughter_crying": """Identify if there is laughter, crying, or other emotional vocalizations in the audio. +Choose exactly one category: +- No laughter or crying: speech only +- Contains laughter: audible laughter is present +- Contains crying: audible crying or sobbing is present +- Contains both: both laughter and crying are present +- Contains other emotional sounds: sighs, gasps, etc. +- Contains multiple emotional vocalizations: combination of various emotional sounds""", + # Interaction Patterns + "overlapping_speech": """Determine if there is overlapping speech in the audio (people talking simultaneously). +Choose exactly one category: +- No overlap: clean turn-taking with no simultaneous speech +- Minimal overlap: occasional brief instances of overlapping speech +- Moderate overlap: noticeable instances where speakers talk over each other +- Significant overlap: frequent overlapping speech, making it difficult to follow +- Constant overlap: multiple speakers talking simultaneously throughout most of the audio""", + # Recording Environment + "speech_background_environment": """Identify the dominant background environment or setting. +Choose one category: +- Quiet indoor: minimal background noise, likely studio environment +- Noisy indoor: indoor setting with noticeable background sounds (cafe, office) +- Outdoor urban: city sounds, traffic +- Outdoor natural: nature sounds, birds, wind, water +- Event/crowd: audience sounds, applause, crowd noise +- Music background: music playing behind speech +- Multiple environments: changes throughout recording""", + "recording_quality": """Assess the technical quality of the audio recording. 
+Choose one category:
+- Professional: studio-quality, broadcast standard
+- Good: clear recording with minimal issues
+- Fair: noticeable recording artifacts but generally clear
+- Poor: significant recording issues affecting comprehension
+- Very poor: severe technical problems making content difficult to understand""",
+    "channel_type": """Identify the likely recording channel or device type used to record this audio.
+Choose exactly one category:
+- Professional microphone: high-quality, full-range audio
+- Consumer microphone: decent quality but less clarity than professional
+- Smartphone: typical mobile phone recording quality
+- Telephone/VoIP: limited frequency range, compression artifacts
+- Webcam/computer mic: variable quality, often with computer fan noise
+- Headset microphone: close to mouth, may have breathing sounds
+- Distant microphone: recorded from a distance, may have room echo
+- Radio/broadcast: compressed audio with limited frequency range
+- Surveillance/hidden mic: typically lower quality with background noise""",
+    # Vocal Evaluation
+    "singing_technique": """You are an expert in vocal performance and singing technique.
+Given the following audio clip of a singing voice, your task is to identify the predominant singing style used.
+Choose one of the following seven styles based on the vocal characteristics:
+
+Breathy: Light, airy voice with noticeable breathiness.
+Falsetto: High-pitched, flute-like sound, especially for male voices.
+Mixed Voice: A blend of chest and head voice, balanced resonance.
+Pharyngeal: Focused, twangy tone with forward placement in the pharynx.
+Glissando: Smooth, sliding transitions between notes.
+Vibrato: Regular, pulsating pitch variation while sustaining a note.
+Control: A neutral, well-supported tone without stylistic effects.
+
+Carefully listen to the tone quality, pitch control, resonance, and transitions in the audio.
+Then, output only the predicted singing style from the list above.
+""",
+}
+
+
+class Processor:
+    """Preprocessor for GLM-4-Voice audio tokenization.
+    """
+
+    def __init__(self, whisper_model: WhisperVQEncoder, feature_extractor: WhisperFeatureExtractor, text_processor: AutoTokenizer):
+        self.whisper_model = whisper_model
+        self.feature_extractor = feature_extractor
+        self.text_tokenizer = text_processor
+
+def glm_4_voice_model_setup(
+    model_tag: str = "THUDM/glm-4-voice-9b",
+    tokenizer_tag: str = "THUDM/glm-4-voice-tokenizer",
+    dtype: str = "bfloat16",
+    start_prompt: str = "The following is a conversation with an AI assistant. The assistant is helpful, honest, and harmless.",
+) -> Dict[str, Any]:
+    """Set up the GLM-4-Voice model for speech analysis.
+
+    Args:
+        model_tag: Model identifier for GLM-4-Voice, defaults to THUDM/glm-4-voice-9b
+        tokenizer_tag: Speech tokenizer identifier, defaults to THUDM/glm-4-voice-tokenizer
+        dtype: Weight precision; "int4" loads the model with 4-bit quantization (default "bfloat16")
+        start_prompt: Initial system prompt for the model conversation
+
+    Returns:
+        Dictionary containing model, processor, and conversation starter
+    """
+    if model_tag == "default":
+        model_tag = "THUDM/glm-4-voice-9b"
+        tokenizer_tag = "THUDM/glm-4-voice-tokenizer"
+        # Can not download the model checkpoint from the Hugging Face Hub
+        # Use the local model checkpoint for quick testing
+        # tokenizer_tag = "/home/stan/GLM-4-Voice/model_checkpoint/glm-4-voice-tokenizer"
+
+    if AutoModel is None or AutoTokenizer is None or BitsAndBytesConfig is None or WhisperFeatureExtractor is None:
+        raise RuntimeError(
+            "GLM-4-Voice evaluation was requested, but the transformers library is not installed."
+ ) + + device_id = torch.cuda.current_device() + device = torch.device(f"cuda:{device_id}") + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 + ) if dtype == "int4" else None + + model = AutoModel.from_pretrained( + model_tag, + trust_remote_code=True, + quantization_config=bnb_config if bnb_config else None, + device_map={"": device_id} + ).eval() + text_processor = AutoTokenizer.from_pretrained(model_tag, trust_remote_code=True) + + whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_tag).eval().to(device) + feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_tag) + processor = Processor(whisper_model, feature_extractor, text_processor) + sampling_params = { + "temperature": 1.0, + "top_p": 1.0, + } + + start_conversation = [ + {"role": "assistant", "message_type": "text", "content": start_prompt}, + ] + return { + "model": model, + "processor": processor, + "sampling_params": sampling_params, + "start_conversation": start_conversation, + } + + +def glm_4_voice_base_metric( + glm_4_voice_utils: Dict[str, Any], + pred_x: np.ndarray, + fs: int = 16000, + custom_prompt: Optional[str] = None, + max_length: int = 1000, +) -> str: + """Calculate the base metric from GLM-4-Voice results. + + Args: + glm_4_voice_utils: A utility dict for GLM-4-Voice calculation containing: + 'model', 'sampling_params', and 'start_conversation' + pred_x: Test signal (time,) + fs: Sampling rate in Hz + custom_prompt: Custom prompt for the model + max_length: Maximum length for model generation + + Returns: + Model's response as a string + """ + if custom_prompt is None: + raise ValueError("Custom prompt must be provided for the GLM-4-Voice model.") + + conversation = copy.deepcopy(glm_4_voice_utils["start_conversation"]) + processor = glm_4_voice_utils["processor"] # The preprocessor is for audio tokenization + model = glm_4_voice_utils["model"] + sampling_params = glm_4_voice_utils["sampling_params"] + temperature = float(sampling_params.get("temperature", 1.0)) + top_p = float(sampling_params.get("top_p", 1.0)) + + # Audio preprocess + audio = torch.from_numpy(pred_x).unsqueeze(0).to(model.device) + audio_tokens = extract_speech_token( + processor.whisper_model, processor.feature_extractor, [(audio, fs)] + )[0] + + audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens]) + audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>" + audio = audio_tokens + + # Text preprocess + # Need to check the format later + + input_text = conversation[0]["content"] + "\n" + custom_prompt + # print(f"Input text: {input_text}") + + user_input = f"<|system|>\n{input_text}\n<|user|>\n{audio}<|assistant|>streaming_transcription\n" + + inputs = processor.text_tokenizer([user_input], return_tensors="pt") + inputs = inputs.to(model.device) + + + response = model.generate( + **inputs, + max_new_tokens=max_length, + temperature=temperature, + top_p=top_p, + )[0] + text_tokens = [] + audio_offset = processor.text_tokenizer.convert_tokens_to_ids('<|audio_0|>') + for token_id in response: + # The model should not generate audio tokens + if token_id >= audio_offset: + continue + else: + text_tokens.append(token_id) + response = processor.text_tokenizer.decode(text_tokens, ignore_special_tokens=False) + + return response + + +def create_metric_fn(metric_name: str) -> callable: + """Factory function to create metric functions. 
+ + Args: + metric_name: Name of the metric to create a function for + + Returns: + Function that calculates the specified metric + """ + + def metric_fn( + glm_4_voice_utils: Dict[str, Any], + pred_x: np.ndarray, + fs: int = 16000, + custom_prompt: Optional[str] = None, + ) -> Dict[str, str]: + """Calculate the specified metric from GLM-4-Voice results. + + Args: + glm_4_voice_utils: A utility dict for GLM-4-Voice calculation + pred_x: Test signal (time,) + fs: Sampling rate in Hz + custom_prompt: Custom prompt for the model + + Returns: + Dictionary containing the metric result + """ + if custom_prompt is None: + custom_prompt = DEFAULT_PROMPTS.get(metric_name) + if custom_prompt is None: + raise ValueError(f"No default prompt found for metric: {metric_name}") + + response = glm_4_voice_base_metric(glm_4_voice_utils, pred_x, fs, custom_prompt) + return {f"glm_4_voice_{metric_name}": response} + + return metric_fn + + +# Create metric functions for all categories +# 1. Speaker Characteristics +glm_4_voice_speaker_count_metric = create_metric_fn("speaker_count") +glm_4_voice_speaker_gender_metric = create_metric_fn("speaker_gender") +glm_4_voice_speaker_age_metric = create_metric_fn("speaker_age") +glm_4_voice_speech_impairment_metric = create_metric_fn("speech_impairment") + +# 2. Voice Properties +glm_4_voice_voice_pitch_metric = create_metric_fn("voice_pitch") +glm_4_voice_pitch_range_metric = create_metric_fn("pitch_range") +glm_4_voice_voice_type_metric = create_metric_fn("voice_type") +glm_4_voice_speech_volume_level_metric = create_metric_fn("speech_volume_level") + +# 3. Speech Content +glm_4_voice_language_metric = create_metric_fn("language") +glm_4_voice_speech_register_metric = create_metric_fn("speech_register") +glm_4_voice_vocabulary_complexity_metric = create_metric_fn("vocabulary_complexity") +glm_4_voice_speech_purpose_metric = create_metric_fn("speech_purpose") + +# 4. Speech Delivery +glm_4_voice_speech_emotion_metric = create_metric_fn("speech_emotion") +glm_4_voice_speech_clarity_metric = create_metric_fn("speech_clarity") +glm_4_voice_speech_rate_metric = create_metric_fn("speech_rate") +glm_4_voice_speaking_style_metric = create_metric_fn("speaking_style") +glm_4_voice_laughter_crying_metric = create_metric_fn("laughter_crying") + +# 5. Interaction Patterns +glm_4_voice_overlapping_speech_metric = create_metric_fn("overlapping_speech") + +# 6. Recording Environment +glm_4_voice_speech_background_environment_metric = create_metric_fn( + "speech_background_environment" +) +glm_4_voice_recording_quality_metric = create_metric_fn("recording_quality") +glm_4_voice_channel_type_metric = create_metric_fn("channel_type") + +# 7. 
Vocal Evaluation
+glm_4_voice_singing_technique_metric = create_metric_fn("singing_technique")
+
+if __name__ == "__main__":
+    a = np.random.random(16000)
+    glm_4_voice_utils = glm_4_voice_model_setup()
+    all_metrics = [
+        glm_4_voice_speaker_count_metric,
+        glm_4_voice_speaker_gender_metric,
+        glm_4_voice_speaker_age_metric,
+        glm_4_voice_speech_impairment_metric,
+        glm_4_voice_voice_pitch_metric,
+        glm_4_voice_pitch_range_metric,
+        glm_4_voice_voice_type_metric,
+        glm_4_voice_speech_volume_level_metric,
+        glm_4_voice_language_metric,
+        glm_4_voice_speech_register_metric,
+        glm_4_voice_vocabulary_complexity_metric,
+        glm_4_voice_speech_purpose_metric,
+        glm_4_voice_speech_emotion_metric,
+        glm_4_voice_speech_clarity_metric,
+        glm_4_voice_speech_rate_metric,
+        glm_4_voice_speaking_style_metric,
+        glm_4_voice_laughter_crying_metric,
+        glm_4_voice_overlapping_speech_metric,
+        glm_4_voice_speech_background_environment_metric,
+        glm_4_voice_recording_quality_metric,
+        glm_4_voice_channel_type_metric,
+        glm_4_voice_singing_technique_metric,
+    ]
+
+    for fn in all_metrics:
+        print("metrics: {}".format(fn(glm_4_voice_utils, a, 16000)))
+        break
+        # print("metrics: {}".format(glm_4_voice_speech_emotion_metric(glm_4_voice_utils, a, 16000)))

From cdd5b9d8b678ae5235e7be983517b4e66a183341 Mon Sep 17 00:00:00 2001
From: Stanwang1210
Date: Wed, 26 Nov 2025 09:53:35 +0000
Subject: [PATCH 2/2] add dependency instruction

---
 versa/utterance_metrics/glm_4_voice.py | 232 +++----------------------
 1 file changed, 20 insertions(+), 212 deletions(-)

diff --git a/versa/utterance_metrics/glm_4_voice.py b/versa/utterance_metrics/glm_4_voice.py
index 9361621..502a7b0 100644
--- a/versa/utterance_metrics/glm_4_voice.py
+++ b/versa/utterance_metrics/glm_4_voice.py
@@ -73,218 +73,31 @@
     )
     AutoModel, AutoTokenizer, WhisperFeatureExtractor, BitsAndBytesConfig = None, None, None, None
 
+from versa.utterance_metrics.qwen2_audio import DEFAULT_PROMPTS
 # TO-DO: Remove this line when the GLM-4-Voice is installed in the environment
 # This is a workaround to ensure the GLM-4-Voice model can be imported correctly
-library_path = os.path.expanduser("~/GLM-4-Voice")
-if library_path not in sys.path:
-    sys.path.insert(0, library_path)
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.abspath(os.path.join(current_dir, "../.."))
+library_path = os.path.join(project_root, "GLM-4-Voice")
+
+if os.path.exists(library_path):
+    if library_path not in sys.path:
+        sys.path.insert(0, library_path)
+else:
+    logging.warning(f"GLM-4-Voice repository not found at {library_path}. Please run 'git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice' in the project root.")
+
+try:
+    from speech_tokenizer.modeling_whisper import WhisperVQEncoder
+    from speech_tokenizer.utils import extract_speech_token
+except ImportError:
+    logging.error("Failed to import GLM-4-Voice modules. Ensure the repository is cloned correctly.")
+    WhisperVQEncoder = None
+    extract_speech_token = None
-from speech_tokenizer.modeling_whisper import WhisperVQEncoder
-from speech_tokenizer.utils import extract_speech_token
 # from audio_process import AudioStreamProcessor
 # Default prompts for different metrics
-DEFAULT_PROMPTS = {
-    # Speaker Characteristics
-    "speaker_count": """Analyze the audio and determine the number of distinct speakers present.
-Provide your answer as a single number between 1-10.
-Examples: -- For a monologue: 1 -- For an interview with host and guest: 2 -- For a panel discussion with a moderator and three panelists: 4""", - "speaker_gender": """Identify the perceived gender of the speaker(s). -If multiple speakers, list each speaker with their perceived gender. -Choose from: -- Male -- Female -- Non-binary/unclear -- Multiple speakers with mixed genders""", - "speaker_age": """Identify the age group of the speaker. -Choose exactly one label from the following categories: -- Child: under 13 years -- Teen: 13-19 years -- Young adult: 20-35 years -- Middle-aged adult: 36-55 years -- Senior: over 55 years""", - "speech_impairment": """Assess whether there are any noticeable speech impairments or disorders in the speaker's voice. -Choose exactly one category: -- No apparent impairment: typical speech patterns -- Stuttering/disfluency: repetitions, blocks, or prolongations of sounds -- Articulation disorder: difficulty with specific speech sounds -- Voice disorder: abnormal pitch, loudness, or quality -- Fluency disorder: atypical rhythm, rate, or flow of speech -- Foreign accent: non-native pronunciation patterns -- Dysarthria: slurred or unclear speech from muscle weakness -- Apraxia: difficulty with motor planning for speech -- Other impairment: speech pattern that suggests a different disorder""", - # Voice Properties - "voice_pitch": """Analyze the voice pitch/tone of the speaker. -Choose exactly one category from the following: -- Very high: significantly higher than average for their perceived gender -- High: noticeably above average pitch -- Medium: average pitch range -- Low: noticeably below average pitch -- Very low: significantly lower than average for their perceived gender""", - "pitch_range": """Assess the pitch variation/intonation range in the speaker's voice. -Choose exactly one category: -- Wide range: highly expressive with significant variation between high and low tones -- Moderate range: normal variation in pitch during speech -- Narrow range: minimal pitch variation, relatively monotone delivery -- Monotone: almost no pitch variation""", - "voice_type": """Identify the dominant voice quality-related characteristic of the speaker. -Choose exactly one category: -- Clear: clean vocal production without noticeable texture issues -- Breathy: voice has audible breath sounds, less vocal cord closure -- Creaky/vocal fry: low-frequency rattling sound, especially at ends of phrases -- Hoarse: rough, raspy quality indicating vocal strain -- Nasal: voice resonates primarily through the nose -- Pressed/tense: strained quality from excessive vocal cord pressure -- Resonant: rich, vibrant voice with good projection -- Whispered: intentionally quiet with minimal vocal cord vibration -- Tremulous: shaky or quivery voice quality""", - "speech_volume_level": """Assess the overall volume or loudness level of the speaker. -Choose exactly one category: -- Very quiet: barely audible, whispering or very soft-spoken -- Quiet: below average volume, soft-spoken -- Moderate: normal conversational volume -- Loud: above average volume, projecting voice -- Very loud: shouting or extremely high volume -- Variable: significant changes in volume throughout the recording""", - # Speech Content - "language": """Identify all languages spoken in the audio. -List languages using their English names. 
-Choose from common languages: -- English -- Spanish -- Mandarin Chinese -- Hindi -- Arabic -- French -- Russian -- Portuguese -- German -- Japanese -- Other (specify if possible)""", - "speech_register": """Determine the speech register used by the speaker. -Choose exactly one category: -- Formal register: careful pronunciation, complex grammar, specialized vocabulary -- Standard register: proper grammar and pronunciation for professional or educational contexts -- Consultative register: mixture of formal and casual for everyday professional interactions -- Casual register: relaxed grammar, contractions, colloquialisms for friends/family -- Intimate register: highly familiar language used with close relations -- Technical register: specialized terminology for a specific field or profession -- Slang register: highly informal with group-specific vocabulary""", - "vocabulary_complexity": """Evaluate the vocabulary complexity level in the speech. -Choose exactly one category: -- Basic: simple, everyday vocabulary, mostly high-frequency words -- General: standard vocabulary for common topics, occasional advanced words -- Advanced: sophisticated vocabulary with specific terminology -- Technical: specialized/domain-specific terminology -- Academic: scholarly vocabulary with abstract concepts""", - "speech_purpose": """Identify the primary purpose of the speech. -Choose one category: -- Informative: primarily explains or educates -- Persuasive: attempts to convince or change opinions -- Entertainment: primarily aims to amuse or entertain -- Narrative: tells a story or relates events -- Conversational: casual exchange of information -- Instructional: provides specific directions or guidance -- Emotional expression: primarily conveys feelings or emotional state""", - # Speech Delivery - "speech_emotion": """Identify the dominant emotion expressed in this speech. -Choose exactly one label from the following categories: -- Neutral: even-toned, matter-of-fact delivery with minimal emotional expression -- Happy: upbeat, positive, enthusiastic tone -- Sad: downcast, melancholic, somber tone -- Angry: irritated, frustrated, hostile tone -- Fearful: anxious, worried, frightened tone -- Surprised: astonished, shocked tone -- Disgusted: repulsed, revolted tone -- Other: other emotion that cannot be classified by above classes""", - "speech_clarity": """Rate the overall clarity and intelligibility of the speech. -Choose one category: -- High clarity: perfectly intelligible, professional quality -- Medium clarity: generally understandable with occasional unclear segments -- Low clarity: difficult to understand, frequent unclear segments -- Very low clarity: mostly unintelligible""", - "speech_rate": """Assess the rate of speech in the audio. -Choose one category: -- Very slow: deliberate, significantly slower than average speech -- Slow: relaxed pace, slower than conversational speech -- Medium: average conversational pace -- Fast: quicker than average conversational speech -- Very fast: rapid delivery, difficult to follow""", - "speaking_style": """Identify the predominant speaking style of the speaker. 
-Choose exactly one category: -- Formal: structured, proper, adherence to linguistic conventions -- Professional: clear, efficient communication focused on task/topic -- Casual/conversational: relaxed, everyday speech -- Animated/enthusiastic: highly energetic, expressive speech -- Deliberate: careful, measured delivery -- Dramatic: theatrical, performance-oriented speech -- Authoritative: commanding, confident tone -- Hesitant: uncertain, tentative speech with pauses""", - "laughter_crying": """Identify if there is laughter, crying, or other emotional vocalizations in the audio. -Choose exactly one category: -- No laughter or crying: speech only -- Contains laughter: audible laughter is present -- Contains crying: audible crying or sobbing is present -- Contains both: both laughter and crying are present -- Contains other emotional sounds: sighs, gasps, etc. -- Contains multiple emotional vocalizations: combination of various emotional sounds""", - # Interaction Patterns - "overlapping_speech": """Determine if there is overlapping speech in the audio (people talking simultaneously). -Choose exactly one category: -- No overlap: clean turn-taking with no simultaneous speech -- Minimal overlap: occasional brief instances of overlapping speech -- Moderate overlap: noticeable instances where speakers talk over each other -- Significant overlap: frequent overlapping speech, making it difficult to follow -- Constant overlap: multiple speakers talking simultaneously throughout most of the audio""", - # Recording Environment - "speech_background_environment": """Identify the dominant background environment or setting. -Choose one category: -- Quiet indoor: minimal background noise, likely studio environment -- Noisy indoor: indoor setting with noticeable background sounds (cafe, office) -- Outdoor urban: city sounds, traffic -- Outdoor natural: nature sounds, birds, wind, water -- Event/crowd: audience sounds, applause, crowd noise -- Music background: music playing behind speech -- Multiple environments: changes throughout recording""", - "recording_quality": """Assess the technical quality of the audio recording. -Choose one category: -- Professional: studio-quality, broadcast standard -- Good: clear recording with minimal issues -- Fair: noticeable recording artifacts but generally clear -- Poor: significant recording issues affecting comprehension -- Very poor: severe technical problems making content difficult to understand""", - "channel_type": """Identify the likely recording channel or device type used to record this audio. -Choose exactly one category: -- Professional microphone: high-quality, full-range audio -- Consumer microphone: decent quality but less clarity than professional -- Smartphone: typical mobile phone recording quality -- Telephone/VoIP: limited frequency range, compression artifacts -- Webcam/computer mic: variable quality, often with computer fan noise -- Headset microphone: close to mouth, may have breathing sounds -- Distant microphone: recorded from a distance, may have room echo -- Radio/broadcast: compressed audio with limited frequency range -- Surveillance/hidden mic: typically lower quality with background noise""", - # Vocal Evaluation - "singing_technique": """You are an expert in vocal performance and singing technique. -Given the following audio clip of a singing voice, your task is to identify the predominant singing style used. 
-Choose one of the following seven styles based on the vocal characteristics: - -Breathy: Light, airy voice with noticeable breathiness. -Falsetto: High-pitched, flute-like sound, especially for male voices. -Mixed Voice: A blend of chest and head voice, balanced resonance. -Pharyngeal: Focused, twangy tone with forward placement in the pharynx. -Glissando: Smooth, sliding transitions between notes. -Vibrato: Regular, pulsating pitch variation while sustaining a note. -Control: A neutral, well-supported tone without stylistic effects. - -Carefully listen to the tone quality, pitch control, resonance, and transitions in the audio. -Then, output only the predicted singing style from the list above. -""", -} - class Processor: """Preprocessor for GLM-4-Voice audio tokenization. @@ -313,9 +126,6 @@ def glm_4_voice_model_setup( if model_tag == "default": model_tag = "THUDM/glm-4-voice-9b" tokenizer_tag = "THUDM/glm-4-voice-tokenizer" - # Can not download the model checkpoint from the Hugging Face Hub - # Use the local model checkpoint for quick testing - # tokenizer_tag = "/home/stan/GLM-4-Voice/model_checkpoint/glm-4-voice-tokenizer" if AutoModel is None or AutoTokenizer is None or BitsAndBytesConfig is None or WhisperFeatureExtractor is None: raise RuntimeError( @@ -424,8 +234,8 @@ def glm_4_voice_base_metric( continue else: text_tokens.append(token_id) - response = processor.text_tokenizer.decode(text_tokens, ignore_special_tokens=False) - + responses = processor.text_tokenizer.decode(text_tokens, ignore_special_tokens=True) + response = responses.split("<|assistant|>")[-1].split("\n")[-1].replace('<|user|>', '').strip() return response @@ -536,5 +346,3 @@ def metric_fn( for fn in all_metrics: print("metrics: {}".format(fn(glm_4_voice_utils, a, 16000))) - break - # print("metrics: {}".format(glm_4_voice_speech_emotion_metric(glm_4_voice_utils, a, 16000)))
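A minimal usage sketch for trying the new metrics after applying this series (not part of the patch). It assumes the GLM-4-Voice repository has been cloned into the project root as the second commit instructs, a CUDA device is available, and "sample.wav" is a hypothetical 16 kHz recording:

    import librosa

    from versa.utterance_metrics.glm_4_voice import (
        glm_4_voice_model_setup,
        glm_4_voice_speech_emotion_metric,
    )

    # Load the audio at the 16 kHz rate expected by the Whisper-based speech tokenizer.
    pred_x, fs = librosa.load("sample.wav", sr=16000)

    # One-time setup: downloads THUDM/glm-4-voice-9b and its tokenizer unless cached.
    glm_4_voice_utils = glm_4_voice_model_setup()

    # Run a single metric; the default "speech_emotion" prompt is used because
    # no custom_prompt is passed.
    print(glm_4_voice_speech_emotion_metric(glm_4_voice_utils, pred_x, fs=fs))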