From 9201976394523a00ae40b9f83a920726d14e27c2 Mon Sep 17 00:00:00 2001 From: Stanwang1210 Date: Sat, 21 Jun 2025 08:54:46 +0000 Subject: [PATCH 1/2] init kimi --- versa/utterance_metrics/kimi_audio.py | 488 ++++++++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 versa/utterance_metrics/kimi_audio.py diff --git a/versa/utterance_metrics/kimi_audio.py b/versa/utterance_metrics/kimi_audio.py new file mode 100644 index 0000000..d80e83c --- /dev/null +++ b/versa/utterance_metrics/kimi_audio.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Jiatong Shi +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +Speech Properties for Metadata Modeling (Kimi-based) + +This module provides functions for extracting various speech properties +from audio using Kimi-Audio. The properties are organized into the +following categories: + +1. Speaker Characteristics + - kimi_speaker_count_metric: Number of distinct speakers + - kimi_speaker_gender_metric: Gender of speaker(s) + - kimi_speaker_age_metric: Age group of speaker(s) + - kimi_speech_impairment_metric: Presence and type of speech disorders + +2. Voice Properties + - kimi_voice_pitch_metric: Overall pitch level + - kimi_pitch_range_metric: Variation in intonation + - kimi_voice_type_metric: Voice texture characteristics + - kimi_speech_volume_level_metric: Loudness of speech + +3. Speech Content + - kimi_language_metric: Language(s) being spoken + - kimi_speech_register_metric: Level of formality in speech + - kimi_vocabulary_complexity_metric: Sophistication of word choice + - kimi_speech_purpose_metric: Communicative goal of speech + +4. Speech Delivery + - kimi_speech_emotion_metric: Emotional state conveyed + - kimi_speech_clarity_metric: Intelligibility of speech + - kimi_speech_rate_metric: Speed of delivery + - kimi_speaking_style_metric: Overall presentation manner + - kimi_laughter_crying_metric: Presence of emotional vocalizations + +5. 
Interaction Patterns + - kimi_overlapping_speech_metric: Degree of simultaneous speech + +6. Recording Environment + - kimi_speech_background_environment_metric: Setting where recorded + - kimi_recording_quality_metric: Technical quality of recording + - kimi_channel_type_metric: Equipment used for recording + +7. Vocal Evaluation + - kimi_singing_technique_metric: Singing Techniques (styles) + +Each function follows the same signature pattern: + kimi_utils: Dictionary containing model, processor, and conversation + pred_x: Audio signal as numpy array + fs: Sampling rate in Hz (default 16000) + custom_prompt: Optional custom prompt to override default + +Each function returns a dictionary with a single key-value pair where +the key is the metric name prefixed with "kimi_" and the value is the +model's response. +""" + +import copy +import logging +from typing import Dict, Optional, Any + +import librosa +import numpy as np +import torch + +try: + from kimia_infer.api.kimia import KimiAudio + from kimia_infer.models.tokenizer.glm4_tokenizer import Glm4Tokenizer +except ImportError: + logging.warning( + "If KimiAudio is not found with key error, please install the latest version of Kimi-Audio and retry." + ) + KimiAudio, Glm4Tokenizer = None, None + + +# Default prompts for different metrics +DEFAULT_PROMPTS = { + # Speaker Characteristics + "speaker_count": """Analyze the audio and determine the number of distinct speakers present. +Provide your answer as a single number between 1-10. +Examples: +- For a monologue: 1 +- For an interview with host and guest: 2 +- For a panel discussion with a moderator and three panelists: 4""", + "speaker_gender": """Identify the perceived gender of the speaker(s). +If multiple speakers, list each speaker with their perceived gender. +Choose from: +- Male +- Female +- Non-binary/unclear +- Multiple speakers with mixed genders""", + "speaker_age": """Identify the age group of the speaker. 
+Choose exactly one label from the following categories: +- Child: under 13 years +- Teen: 13-19 years +- Young adult: 20-35 years +- Middle-aged adult: 36-55 years +- Senior: over 55 years""", + "speech_impairment": """Assess whether there are any noticeable speech impairments or disorders in the speaker's voice. +Choose exactly one category: +- No apparent impairment: typical speech patterns +- Stuttering/disfluency: repetitions, blocks, or prolongations of sounds +- Articulation disorder: difficulty with specific speech sounds +- Voice disorder: abnormal pitch, loudness, or quality +- Fluency disorder: atypical rhythm, rate, or flow of speech +- Foreign accent: non-native pronunciation patterns +- Dysarthria: slurred or unclear speech from muscle weakness +- Apraxia: difficulty with motor planning for speech +- Other impairment: speech pattern that suggests a different disorder""", + # Voice Properties + "voice_pitch": """Analyze the voice pitch/tone of the speaker. +Choose exactly one category from the following: +- Very high: significantly higher than average for their perceived gender +- High: noticeably above average pitch +- Medium: average pitch range +- Low: noticeably below average pitch +- Very low: significantly lower than average for their perceived gender""", + "pitch_range": """Assess the pitch variation/intonation range in the speaker's voice. +Choose exactly one category: +- Wide range: highly expressive with significant variation between high and low tones +- Moderate range: normal variation in pitch during speech +- Narrow range: minimal pitch variation, relatively monotone delivery +- Monotone: almost no pitch variation""", + "voice_type": """Identify the dominant voice quality-related characteristic of the speaker. 
+Choose exactly one category: +- Clear: clean vocal production without noticeable texture issues +- Breathy: voice has audible breath sounds, less vocal cord closure +- Creaky/vocal fry: low-frequency rattling sound, especially at ends of phrases +- Hoarse: rough, raspy quality indicating vocal strain +- Nasal: voice resonates primarily through the nose +- Pressed/tense: strained quality from excessive vocal cord pressure +- Resonant: rich, vibrant voice with good projection +- Whispered: intentionally quiet with minimal vocal cord vibration +- Tremulous: shaky or quivery voice quality""", + "speech_volume_level": """Assess the overall volume or loudness level of the speaker. +Choose exactly one category: +- Very quiet: barely audible, whispering or very soft-spoken +- Quiet: below average volume, soft-spoken +- Moderate: normal conversational volume +- Loud: above average volume, projecting voice +- Very loud: shouting or extremely high volume +- Variable: significant changes in volume throughout the recording""", + # Speech Content + "language": """Identify all languages spoken in the audio. +List languages using their English names. +Choose from common languages: +- English +- Spanish +- Mandarin Chinese +- Hindi +- Arabic +- French +- Russian +- Portuguese +- German +- Japanese +- Other (specify if possible)""", + "speech_register": """Determine the speech register used by the speaker. 
+Choose exactly one category: +- Formal register: careful pronunciation, complex grammar, specialized vocabulary +- Standard register: proper grammar and pronunciation for professional or educational contexts +- Consultative register: mixture of formal and casual for everyday professional interactions +- Casual register: relaxed grammar, contractions, colloquialisms for friends/family +- Intimate register: highly familiar language used with close relations +- Technical register: specialized terminology for a specific field or profession +- Slang register: highly informal with group-specific vocabulary""", + "vocabulary_complexity": """Evaluate the vocabulary complexity level in the speech. +Choose exactly one category: +- Basic: simple, everyday vocabulary, mostly high-frequency words +- General: standard vocabulary for common topics, occasional advanced words +- Advanced: sophisticated vocabulary with specific terminology +- Technical: specialized/domain-specific terminology +- Academic: scholarly vocabulary with abstract concepts""", + "speech_purpose": """Identify the primary purpose of the speech. +Choose one category: +- Informative: primarily explains or educates +- Persuasive: attempts to convince or change opinions +- Entertainment: primarily aims to amuse or entertain +- Narrative: tells a story or relates events +- Conversational: casual exchange of information +- Instructional: provides specific directions or guidance +- Emotional expression: primarily conveys feelings or emotional state""", + # Speech Delivery + "speech_emotion": """Identify the dominant emotion expressed in this speech. 
+Choose exactly one label from the following categories: +- Neutral: even-toned, matter-of-fact delivery with minimal emotional expression +- Happy: upbeat, positive, enthusiastic tone +- Sad: downcast, melancholic, somber tone +- Angry: irritated, frustrated, hostile tone +- Fearful: anxious, worried, frightened tone +- Surprised: astonished, shocked tone +- Disgusted: repulsed, revolted tone +- Other: other emotion that cannot be classified by above classes""", + "speech_clarity": """Rate the overall clarity and intelligibility of the speech. +Choose one category: +- High clarity: perfectly intelligible, professional quality +- Medium clarity: generally understandable with occasional unclear segments +- Low clarity: difficult to understand, frequent unclear segments +- Very low clarity: mostly unintelligible""", + "speech_rate": """Assess the rate of speech in the audio. +Choose one category: +- Very slow: deliberate, significantly slower than average speech +- Slow: relaxed pace, slower than conversational speech +- Medium: average conversational pace +- Fast: quicker than average conversational speech +- Very fast: rapid delivery, difficult to follow""", + "speaking_style": """Identify the predominant speaking style of the speaker. +Choose exactly one category: +- Formal: structured, proper, adherence to linguistic conventions +- Professional: clear, efficient communication focused on task/topic +- Casual/conversational: relaxed, everyday speech +- Animated/enthusiastic: highly energetic, expressive speech +- Deliberate: careful, measured delivery +- Dramatic: theatrical, performance-oriented speech +- Authoritative: commanding, confident tone +- Hesitant: uncertain, tentative speech with pauses""", + "laughter_crying": """Identify if there is laughter, crying, or other emotional vocalizations in the audio. 
+Choose exactly one category: +- No laughter or crying: speech only +- Contains laughter: audible laughter is present +- Contains crying: audible crying or sobbing is present +- Contains both: both laughter and crying are present +- Contains other emotional sounds: sighs, gasps, etc. +- Contains multiple emotional vocalizations: combination of various emotional sounds""", + # Interaction Patterns + "overlapping_speech": """Determine if there is overlapping speech in the audio (people talking simultaneously). +Choose exactly one category: +- No overlap: clean turn-taking with no simultaneous speech +- Minimal overlap: occasional brief instances of overlapping speech +- Moderate overlap: noticeable instances where speakers talk over each other +- Significant overlap: frequent overlapping speech, making it difficult to follow +- Constant overlap: multiple speakers talking simultaneously throughout most of the audio""", + # Recording Environment + "speech_background_environment": """Identify the dominant background environment or setting. +Choose one category: +- Quiet indoor: minimal background noise, likely studio environment +- Noisy indoor: indoor setting with noticeable background sounds (cafe, office) +- Outdoor urban: city sounds, traffic +- Outdoor natural: nature sounds, birds, wind, water +- Event/crowd: audience sounds, applause, crowd noise +- Music background: music playing behind speech +- Multiple environments: changes throughout recording""", + "recording_quality": """Assess the technical quality of the audio recording. +Choose one category: +- Professional: studio-quality, broadcast standard +- Good: clear recording with minimal issues +- Fair: noticeable recording artifacts but generally clear +- Poor: significant recording issues affecting comprehension +- Very poor: severe technical problems making content difficult to understand""", + "channel_type": """Identify the likely recording channel or device type used to record this audio. 
+Choose exactly one category: +- Professional microphone: high-quality, full-range audio +- Consumer microphone: decent quality but less clarity than professional +- Smartphone: typical mobile phone recording quality +- Telephone/VoIP: limited frequency range, compression artifacts +- Webcam/computer mic: variable quality, often with computer fan noise +- Headset microphone: close to mouth, may have breathing sounds +- Distant microphone: recorded from a distance, may have room echo +- Radio/broadcast: compressed audio with limited frequency range +- Surveillance/hidden mic: typically lower quality with background noise""", + # Vocal Evaluation + "singing_technique": """You are an expert in vocal performance and singing technique. +Given the following audio clip of a singing voice, your task is to identify the predominant singing style used. +Choose one of the following seven styles based on the vocal characteristics: + +Breathy: Light, airy voice with noticeable breathiness. +Falsetto: High-pitched, flute-like sound, especially for male voices. +Mixed Voice: A blend of chest and head voice, balanced resonance. +Pharyngeal: Focused, twangy tone with forward placement in the pharynx. +Glissando: Smooth, sliding transitions between notes. +Vibrato: Regular, pulsating pitch variation while sustaining a note. +Control: A neutral, well-supported tone without stylistic effects. + +Carefully listen to the tone quality, pitch control, resonance, and transitions in the audio. +Then, output only the predicted singing style from the list above. +""", +} + + +def kimi_model_setup( + model_tag: str = "MoonshotAI/Kimi-Audio-7B", + start_prompt: str = "The following is a conversation with an AI assistant. The assistant is helpful, honest, and harmless.", +) -> Dict[str, Any]: + """Set up the Kimi-Audio model for speech analysis. 
+ + Args: + model_tag: Model identifier for Kimi-Audio, defaults to MoonshotAI/Kimi-Audio-7B + start_prompt: Initial system prompt for the model conversation + + Returns: + Dictionary containing model, processor, and conversation starter + """ + if model_tag == "default": + model_tag = "MoonshotAI/Kimi-Audio-7B" + if KimiAudio is None or Glm4Tokenizer is None: + raise RuntimeError( + "Kimi-Audio is used for evaluation while the Kimi library is not installed." + ) + model = KimiAudio(model_path=model_tag, load_detokenizer=True) + processor = model.prompt_manager + sampling_params = { + "audio_temperature": 0.8, + "audio_top_k": 10, + "text_temperature": 0.0, + "text_top_k": 5, + "audio_repetition_penalty": 1.0, + "audio_repetition_window_size": 64, + "text_repetition_penalty": 1.0, + "text_repetition_window_size": 16, + } + start_conversation = [ + {"role": "assistant", "message_type": "text", "content": start_prompt}, + ] + return { + "model": model, + "processor": processor, + "sampling_params": sampling_params, + "start_conversation": start_conversation, + } + + +def kimi_base_metric( + kimi_utils: Dict[str, Any], + pred_x: np.ndarray, + fs: int = 16000, + custom_prompt: Optional[str] = None, + max_length: int = 1000, +) -> str: + """Calculate the base metric from Kimi-Audio results. 
+ + Args: + kimi_utils: A utility dict for Kimi-Audio calculation containing: + 'model', 'sampling_params', and 'start_conversation' + pred_x: Test signal (time,) + fs: Sampling rate in Hz + custom_prompt: Custom prompt for the model + max_length: Maximum length for model generation + + Returns: + Model's response as a string + """ + if custom_prompt is None: + raise ValueError("Custom prompt must be provided for the Kimi-Audio model.") + + conversation = copy.deepcopy(kimi_utils["start_conversation"]) + sampling_params = kimi_utils["sampling_params"] + preprocessor = kimi_utils["processor"] # The preprocessor is for audio tokenization + model = kimi_utils["model"] + + audio = torch.from_numpy(librosa.resample( + pred_x, orig_sr=fs, target_sr=16000, # Kimi-Audio uses 16kHz as default sampling rate + )).unsqueeze(0).to(torch.float32).to(model.alm.device) + + audio_tokens = preprocessor.audio_tokenizer.tokenize(speech=audio) + audio_tokens = audio_tokens + preprocessor.kimia_token_offset + audio_tokens = audio_tokens.squeeze(0).cpu().numpy().tolist() + + conversation.extend( + [ + { + "role": "user", + "message_type": "text", + "content": custom_prompt + }, + { + "role": "user", + "message_type": "audio", + "content": audio, + "audio_tokens": audio_tokens, + } + ] + + ) + + _, response = model.generate(conversation, **sampling_params, max_new_tokens=max_length, output_type="text") + return response + + +def create_metric_fn(metric_name: str) -> callable: + """Factory function to create metric functions. + + Args: + metric_name: Name of the metric to create a function for + + Returns: + Function that calculates the specified metric + """ + + def metric_fn( + kimi_utils: Dict[str, Any], + pred_x: np.ndarray, + fs: int = 16000, + custom_prompt: Optional[str] = None, + ) -> Dict[str, str]: + """Calculate the specified metric from Kimi-Audio results. 
+ + Args: + kimi_utils: A utility dict for Kimi-Audio calculation + pred_x: Test signal (time,) + fs: Sampling rate in Hz + custom_prompt: Custom prompt for the model + + Returns: + Dictionary containing the metric result + """ + if custom_prompt is None: + custom_prompt = DEFAULT_PROMPTS.get(metric_name) + if custom_prompt is None: + raise ValueError(f"No default prompt found for metric: {metric_name}") + + response = kimi_base_metric(kimi_utils, pred_x, fs, custom_prompt) + return {f"kimi_{metric_name}": response} + + return metric_fn + + +# Create metric functions for all categories +# 1. Speaker Characteristics +kimi_speaker_count_metric = create_metric_fn("speaker_count") +kimi_speaker_gender_metric = create_metric_fn("speaker_gender") +kimi_speaker_age_metric = create_metric_fn("speaker_age") +kimi_speech_impairment_metric = create_metric_fn("speech_impairment") + +# 2. Voice Properties +kimi_voice_pitch_metric = create_metric_fn("voice_pitch") +kimi_pitch_range_metric = create_metric_fn("pitch_range") +kimi_voice_type_metric = create_metric_fn("voice_type") +kimi_speech_volume_level_metric = create_metric_fn("speech_volume_level") + +# 3. Speech Content +kimi_language_metric = create_metric_fn("language") +kimi_speech_register_metric = create_metric_fn("speech_register") +kimi_vocabulary_complexity_metric = create_metric_fn("vocabulary_complexity") +kimi_speech_purpose_metric = create_metric_fn("speech_purpose") + +# 4. Speech Delivery +kimi_speech_emotion_metric = create_metric_fn("speech_emotion") +kimi_speech_clarity_metric = create_metric_fn("speech_clarity") +kimi_speech_rate_metric = create_metric_fn("speech_rate") +kimi_speaking_style_metric = create_metric_fn("speaking_style") +kimi_laughter_crying_metric = create_metric_fn("laughter_crying") + +# 5. Interaction Patterns +kimi_overlapping_speech_metric = create_metric_fn("overlapping_speech") + +# 6. 
Recording Environment +kimi_speech_background_environment_metric = create_metric_fn( + "speech_background_environment" +) +kimi_recording_quality_metric = create_metric_fn("recording_quality") +kimi_channel_type_metric = create_metric_fn("channel_type") + +# 7. Vocal Evaluation +kimi_singing_technique_metric = create_metric_fn("singing_technique") + +if __name__ == "__main__": + a = np.random.random(16000) + kimi_utils = kimi_model_setup() + all_metrics = [ + kimi_speaker_count_metric, + kimi_speaker_gender_metric, + kimi_speaker_age_metric, + kimi_speech_impairment_metric, + kimi_voice_pitch_metric, + kimi_pitch_range_metric, + kimi_voice_type_metric, + kimi_speech_volume_level_metric, + kimi_language_metric, + kimi_speech_register_metric, + kimi_vocabulary_complexity_metric, + kimi_speech_purpose_metric, + kimi_speech_emotion_metric, + kimi_speech_clarity_metric, + kimi_speech_rate_metric, + kimi_speaking_style_metric, + kimi_laughter_crying_metric, + kimi_overlapping_speech_metric, + kimi_speech_background_environment_metric, + kimi_recording_quality_metric, + kimi_channel_type_metric, + kimi_singing_technique_metric, + ] + + for fn in all_metrics: + print("metrics: {}".format(fn(kimi_utils, a, 16000))) + # print("metrics: {}".format(kimi_speech_emotion_metric(kimi_utils, a, 16000))) From c0bd62c0f1b5bf019412ce49407273a70fac6968 Mon Sep 17 00:00:00 2001 From: Stanwang1210 Date: Tue, 25 Nov 2025 23:50:16 +0000 Subject: [PATCH 2/2] fix kimi bug --- versa/utterance_metrics/kimi_audio.py | 284 ++++---------------------- 1 file changed, 43 insertions(+), 241 deletions(-) diff --git a/versa/utterance_metrics/kimi_audio.py b/versa/utterance_metrics/kimi_audio.py index d80e83c..8648392 100644 --- a/versa/utterance_metrics/kimi_audio.py +++ b/versa/utterance_metrics/kimi_audio.py @@ -59,10 +59,12 @@ import copy import logging +import os from typing import Dict, Optional, Any - +import tempfile import librosa import numpy as np +import soundfile as sf import torch try: @@ 
-74,210 +76,8 @@ ) KimiAudio, Glm4Tokenizer = None, None - +from versa.utterance_metrics.qwen2_audio import DEFAULT_PROMPTS # Default prompts for different metrics -DEFAULT_PROMPTS = { - # Speaker Characteristics - "speaker_count": """Analyze the audio and determine the number of distinct speakers present. -Provide your answer as a single number between 1-10. -Examples: -- For a monologue: 1 -- For an interview with host and guest: 2 -- For a panel discussion with a moderator and three panelists: 4""", - "speaker_gender": """Identify the perceived gender of the speaker(s). -If multiple speakers, list each speaker with their perceived gender. -Choose from: -- Male -- Female -- Non-binary/unclear -- Multiple speakers with mixed genders""", - "speaker_age": """Identify the age group of the speaker. -Choose exactly one label from the following categories: -- Child: under 13 years -- Teen: 13-19 years -- Young adult: 20-35 years -- Middle-aged adult: 36-55 years -- Senior: over 55 years""", - "speech_impairment": """Assess whether there are any noticeable speech impairments or disorders in the speaker's voice. -Choose exactly one category: -- No apparent impairment: typical speech patterns -- Stuttering/disfluency: repetitions, blocks, or prolongations of sounds -- Articulation disorder: difficulty with specific speech sounds -- Voice disorder: abnormal pitch, loudness, or quality -- Fluency disorder: atypical rhythm, rate, or flow of speech -- Foreign accent: non-native pronunciation patterns -- Dysarthria: slurred or unclear speech from muscle weakness -- Apraxia: difficulty with motor planning for speech -- Other impairment: speech pattern that suggests a different disorder""", - # Voice Properties - "voice_pitch": """Analyze the voice pitch/tone of the speaker. 
-Choose exactly one category from the following: -- Very high: significantly higher than average for their perceived gender -- High: noticeably above average pitch -- Medium: average pitch range -- Low: noticeably below average pitch -- Very low: significantly lower than average for their perceived gender""", - "pitch_range": """Assess the pitch variation/intonation range in the speaker's voice. -Choose exactly one category: -- Wide range: highly expressive with significant variation between high and low tones -- Moderate range: normal variation in pitch during speech -- Narrow range: minimal pitch variation, relatively monotone delivery -- Monotone: almost no pitch variation""", - "voice_type": """Identify the dominant voice quality-related characteristic of the speaker. -Choose exactly one category: -- Clear: clean vocal production without noticeable texture issues -- Breathy: voice has audible breath sounds, less vocal cord closure -- Creaky/vocal fry: low-frequency rattling sound, especially at ends of phrases -- Hoarse: rough, raspy quality indicating vocal strain -- Nasal: voice resonates primarily through the nose -- Pressed/tense: strained quality from excessive vocal cord pressure -- Resonant: rich, vibrant voice with good projection -- Whispered: intentionally quiet with minimal vocal cord vibration -- Tremulous: shaky or quivery voice quality""", - "speech_volume_level": """Assess the overall volume or loudness level of the speaker. -Choose exactly one category: -- Very quiet: barely audible, whispering or very soft-spoken -- Quiet: below average volume, soft-spoken -- Moderate: normal conversational volume -- Loud: above average volume, projecting voice -- Very loud: shouting or extremely high volume -- Variable: significant changes in volume throughout the recording""", - # Speech Content - "language": """Identify all languages spoken in the audio. -List languages using their English names. 
-Choose from common languages: -- English -- Spanish -- Mandarin Chinese -- Hindi -- Arabic -- French -- Russian -- Portuguese -- German -- Japanese -- Other (specify if possible)""", - "speech_register": """Determine the speech register used by the speaker. -Choose exactly one category: -- Formal register: careful pronunciation, complex grammar, specialized vocabulary -- Standard register: proper grammar and pronunciation for professional or educational contexts -- Consultative register: mixture of formal and casual for everyday professional interactions -- Casual register: relaxed grammar, contractions, colloquialisms for friends/family -- Intimate register: highly familiar language used with close relations -- Technical register: specialized terminology for a specific field or profession -- Slang register: highly informal with group-specific vocabulary""", - "vocabulary_complexity": """Evaluate the vocabulary complexity level in the speech. -Choose exactly one category: -- Basic: simple, everyday vocabulary, mostly high-frequency words -- General: standard vocabulary for common topics, occasional advanced words -- Advanced: sophisticated vocabulary with specific terminology -- Technical: specialized/domain-specific terminology -- Academic: scholarly vocabulary with abstract concepts""", - "speech_purpose": """Identify the primary purpose of the speech. -Choose one category: -- Informative: primarily explains or educates -- Persuasive: attempts to convince or change opinions -- Entertainment: primarily aims to amuse or entertain -- Narrative: tells a story or relates events -- Conversational: casual exchange of information -- Instructional: provides specific directions or guidance -- Emotional expression: primarily conveys feelings or emotional state""", - # Speech Delivery - "speech_emotion": """Identify the dominant emotion expressed in this speech. 
-Choose exactly one label from the following categories: -- Neutral: even-toned, matter-of-fact delivery with minimal emotional expression -- Happy: upbeat, positive, enthusiastic tone -- Sad: downcast, melancholic, somber tone -- Angry: irritated, frustrated, hostile tone -- Fearful: anxious, worried, frightened tone -- Surprised: astonished, shocked tone -- Disgusted: repulsed, revolted tone -- Other: other emotion that cannot be classified by above classes""", - "speech_clarity": """Rate the overall clarity and intelligibility of the speech. -Choose one category: -- High clarity: perfectly intelligible, professional quality -- Medium clarity: generally understandable with occasional unclear segments -- Low clarity: difficult to understand, frequent unclear segments -- Very low clarity: mostly unintelligible""", - "speech_rate": """Assess the rate of speech in the audio. -Choose one category: -- Very slow: deliberate, significantly slower than average speech -- Slow: relaxed pace, slower than conversational speech -- Medium: average conversational pace -- Fast: quicker than average conversational speech -- Very fast: rapid delivery, difficult to follow""", - "speaking_style": """Identify the predominant speaking style of the speaker. -Choose exactly one category: -- Formal: structured, proper, adherence to linguistic conventions -- Professional: clear, efficient communication focused on task/topic -- Casual/conversational: relaxed, everyday speech -- Animated/enthusiastic: highly energetic, expressive speech -- Deliberate: careful, measured delivery -- Dramatic: theatrical, performance-oriented speech -- Authoritative: commanding, confident tone -- Hesitant: uncertain, tentative speech with pauses""", - "laughter_crying": """Identify if there is laughter, crying, or other emotional vocalizations in the audio. 
-Choose exactly one category: -- No laughter or crying: speech only -- Contains laughter: audible laughter is present -- Contains crying: audible crying or sobbing is present -- Contains both: both laughter and crying are present -- Contains other emotional sounds: sighs, gasps, etc. -- Contains multiple emotional vocalizations: combination of various emotional sounds""", - # Interaction Patterns - "overlapping_speech": """Determine if there is overlapping speech in the audio (people talking simultaneously). -Choose exactly one category: -- No overlap: clean turn-taking with no simultaneous speech -- Minimal overlap: occasional brief instances of overlapping speech -- Moderate overlap: noticeable instances where speakers talk over each other -- Significant overlap: frequent overlapping speech, making it difficult to follow -- Constant overlap: multiple speakers talking simultaneously throughout most of the audio""", - # Recording Environment - "speech_background_environment": """Identify the dominant background environment or setting. -Choose one category: -- Quiet indoor: minimal background noise, likely studio environment -- Noisy indoor: indoor setting with noticeable background sounds (cafe, office) -- Outdoor urban: city sounds, traffic -- Outdoor natural: nature sounds, birds, wind, water -- Event/crowd: audience sounds, applause, crowd noise -- Music background: music playing behind speech -- Multiple environments: changes throughout recording""", - "recording_quality": """Assess the technical quality of the audio recording. -Choose one category: -- Professional: studio-quality, broadcast standard -- Good: clear recording with minimal issues -- Fair: noticeable recording artifacts but generally clear -- Poor: significant recording issues affecting comprehension -- Very poor: severe technical problems making content difficult to understand""", - "channel_type": """Identify the likely recording channel or device type used to record this audio. 
-Choose exactly one category: -- Professional microphone: high-quality, full-range audio -- Consumer microphone: decent quality but less clarity than professional -- Smartphone: typical mobile phone recording quality -- Telephone/VoIP: limited frequency range, compression artifacts -- Webcam/computer mic: variable quality, often with computer fan noise -- Headset microphone: close to mouth, may have breathing sounds -- Distant microphone: recorded from a distance, may have room echo -- Radio/broadcast: compressed audio with limited frequency range -- Surveillance/hidden mic: typically lower quality with background noise""", - # Vocal Evaluation - "singing_technique": """You are an expert in vocal performance and singing technique. -Given the following audio clip of a singing voice, your task is to identify the predominant singing style used. -Choose one of the following seven styles based on the vocal characteristics: - -Breathy: Light, airy voice with noticeable breathiness. -Falsetto: High-pitched, flute-like sound, especially for male voices. -Mixed Voice: A blend of chest and head voice, balanced resonance. -Pharyngeal: Focused, twangy tone with forward placement in the pharynx. -Glissando: Smooth, sliding transitions between notes. -Vibrato: Regular, pulsating pitch variation while sustaining a note. -Control: A neutral, well-supported tone without stylistic effects. - -Carefully listen to the tone quality, pitch control, resonance, and transitions in the audio. -Then, output only the predicted singing style from the list above. 
-""", -} - def kimi_model_setup( model_tag: str = "MoonshotAI/Kimi-Audio-7B", @@ -301,15 +101,15 @@ def kimi_model_setup( model = KimiAudio(model_path=model_tag, load_detokenizer=True) processor = model.prompt_manager sampling_params = { - "audio_temperature": 0.8, - "audio_top_k": 10, - "text_temperature": 0.0, - "text_top_k": 5, - "audio_repetition_penalty": 1.0, - "audio_repetition_window_size": 64, - "text_repetition_penalty": 1.0, - "text_repetition_window_size": 16, - } + "audio_temperature": 0.8, + "audio_top_k": 10, + "text_temperature": 0.0, + "text_top_k": 5, + "audio_repetition_penalty": 1.0, + "audio_repetition_window_size": 64, + "text_repetition_penalty": 1.0, + "text_repetition_window_size": 16, +} start_conversation = [ {"role": "assistant", "message_type": "text", "content": start_prompt}, ] @@ -326,7 +126,7 @@ def kimi_base_metric( pred_x: np.ndarray, fs: int = 16000, custom_prompt: Optional[str] = None, - max_length: int = 1000, + max_length: int = 100, ) -> str: """Calculate the base metric from Kimi-Audio results. 
@@ -346,35 +146,37 @@ def kimi_base_metric( conversation = copy.deepcopy(kimi_utils["start_conversation"]) sampling_params = kimi_utils["sampling_params"] - preprocessor = kimi_utils["processor"] # The preprocessor is for audio tokenization model = kimi_utils["model"] - audio = torch.from_numpy(librosa.resample( - pred_x, orig_sr=fs, target_sr=16000, # Kimi-Audio uses 16kHz as default sampling rate - )).unsqueeze(0).to(torch.float32).to(model.alm.device) - - audio_tokens = preprocessor.audio_tokenizer.tokenize(speech=audio) - audio_tokens = audio_tokens + preprocessor.kimia_token_offset - audio_tokens = audio_tokens.squeeze(0).cpu().numpy().tolist() - - conversation.extend( - [ - { - "role": "user", - "message_type": "text", - "content": custom_prompt - }, - { - "role": "user", - "message_type": "audio", - "content": audio, - "audio_tokens": audio_tokens, - } - ] - - ) + # Resample audio to 16kHz + y = librosa.resample(pred_x, orig_sr=fs, target_sr=16000) + + # Create a temporary file to satisfy the library's requirement for a file path + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + temp_path = f.name + + try: + sf.write(temp_path, y, 16000) + conversation.extend( + [ + { + "role": "user", + "message_type": "text", + "content": custom_prompt + }, + { + "role": "user", + "message_type": "audio", + "content": temp_path, + } + ] + + ) + _, response = model.generate(conversation, **sampling_params, max_new_tokens=max_length, output_type="text") + finally: + if os.path.exists(temp_path): + os.remove(temp_path) - _, response = model.generate(conversation, **sampling_params, max_new_tokens=max_length, output_type="text") return response @@ -457,6 +259,7 @@ def metric_fn( if __name__ == "__main__": a = np.random.random(16000) + kimi_utils = kimi_model_setup() all_metrics = [ kimi_speaker_count_metric, @@ -485,4 +288,3 @@ def metric_fn( for fn in all_metrics: print("metrics: {}".format(fn(kimi_utils, a, 16000))) - # print("metrics: 
{}".format(kimi_speech_emotion_metric(kimi_utils, a, 16000)))