# Source: MingTok-Audio/example_usage.py (ABC0408/MingTok-Audio, main branch, GitHub)
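"""
Usage examples for the voice conversion system.

This script shows how to use the provided tools to:
1. Verify whether the semantic features carry speaker information
2. Use the voice conversion model
"""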

import torch
import torchaudio
from audio_tokenizer.modeling_audio_vae import AudioVAE


def example_1_extract_features():
    """
    Example 1: extract audio features with MingTok-Audio.

    Loads the pretrained ``AudioVAE`` onto the GPU, reads the sample file
    ``data/1089-134686-0000.flac``, encodes it to an acoustic latent, derives
    the unified embedding from that latent, decodes the latent back into a
    waveform and writes the result to ``./reconstructed_audio.wav``.

    The reconstruction is written with a 16 kHz header, so an input recorded
    at any other rate is resampled to 16 kHz before encoding. Previously the
    loaded sample rate was ignored, which would mislabel the output for
    non-16 kHz inputs.

    Requires a CUDA device: the model and tensors are moved with ``.cuda()``
    and the forward passes run under CUDA bfloat16 autocast.

    Returns:
        None. Progress is printed and the reconstructed audio is saved to disk.
    """
    # Rate used for the saved reconstruction.
    # NOTE(review): assumes the tokenizer itself operates at 16 kHz, as the
    # hard-coded save rate implies -- confirm against the model config.
    target_sr = 16000
    rule = "=" * 80

    print(rule)
    print("示例1: 提取音频特征")
    print(rule)

    # Load the MingTok-Audio model and switch it to inference mode.
    print("\n加载MingTok-Audio模型...")
    model = AudioVAE.from_pretrained('inclusionAI/MingTok-Audio')
    model = model.cuda()
    model.eval()

    # Load the audio file.
    print("加载音频文件...")
    waveform, sr = torchaudio.load('data/1089-134686-0000.flac', backend='soundfile')

    # Bring the input to the rate the output will be labelled with; this is a
    # no-op for the bundled 16 kHz sample.
    if sr != target_sr:
        print(f"输入采样率为 {sr} Hz，重采样到 {target_sr} Hz...")
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sr, new_freq=target_sr
        )

    # Build the keyword arguments passed to encode_latent.
    sample = {
        'waveform': waveform.cuda(),
        'waveform_length': torch.tensor([waveform.size(-1)]).cuda()
    }

    # Extract features without tracking gradients, under bf16 autocast.
    print("\n提取特征...")
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        # 1. Acoustic latent (the second return value is unused here).
        latent, frame_num = model.encode_latent(**sample)
        print(f"✓ Latent特征: {latent.shape}")

        # 2. Unified embedding (semantic feature) derived from the latent.
        unified_emb, _ = model.encode_unified_emb_from_latent(latent)
        print(f"✓ Unified Embedding特征: {unified_emb.shape}")

        # 3. Reconstruct the waveform from the latent.
        output_waveform = model.decode(latent)
        print(f"✓ 重建音频: {output_waveform.shape}")

    # Save the first item of the decoded output as the reconstruction.
    torchaudio.save(
        './reconstructed_audio.wav',
        output_waveform.cpu()[0],
        sample_rate=target_sr
    )
    print("\n✓ 重建音频已保存到: reconstructed_audio.wav")

    print("\n" + rule)
    print("说明:")
    print("- latent: 低维声学表示，包含重建音频所需的所有信息")
    print("- unified_emb: 通过Whisper编码器处理后的高维特征")
    print("- 这两种特征都可能包含说话人信息，需要实验验证")
    print(rule)


def example_2_verify_speaker_info():
    """
    Example 2: show how to verify whether features carry speaker information.

    This function does not run the experiment itself; it only prints a
    walkthrough (including a code snippet) to stdout. Running the snippet
    requires audio recordings from several speakers.

    Returns:
        None.
    """
    rule = "=" * 80
    title = "示例2: 验证特征是否包含说话人信息"

    # Printed verbatim; the doubled backslashes keep "\n" literal inside the
    # code snippet shown to the user.
    instructions = """
要运行完整的验证实验，请：

1. 准备数据集（至少3-5个说话人，每人3-5段音频）:

from audio_tokenizer.modeling_audio_vae import AudioVAE
from verify_speaker_info import SpeakerInfoAnalyzer

# 加载模型
model = AudioVAE.from_pretrained('inclusionAI/MingTok-Audio')
model = model.cuda()

# 创建分析器
analyzer = SpeakerInfoAnalyzer(model, device='cuda')

# 准备音频文件路径
audio_files = {
    'speaker_1': [
        'path/to/speaker1_audio1.flac',
        'path/to/speaker1_audio2.flac',
        'path/to/speaker1_audio3.flac',
    ],
    'speaker_2': [
        'path/to/speaker2_audio1.flac',
        'path/to/speaker2_audio2.flac',
        'path/to/speaker2_audio3.flac',
    ],
    'speaker_3': [
        'path/to/speaker3_audio1.flac',
        'path/to/speaker3_audio2.flac',
        'path/to/speaker3_audio3.flac',
    ],
}

# 提取特征
analyzer.extract_features(audio_files)

# 实验1: 训练说话人分类器
print("\\n实验1: 训练说话人分类器")
acc_latent = analyzer.train_speaker_classifier(feature_type='latent', num_epochs=50)
acc_unified = analyzer.train_speaker_classifier(feature_type='unified_emb', num_epochs=50)

# 实验2: 相似度分析
print("\\n实验2: 相似度分析")
sim_latent = analyzer.compute_similarity_analysis(feature_type='latent')
sim_unified = analyzer.compute_similarity_analysis(feature_type='unified_emb')

# 实验3: 特征空间可视化
print("\\n实验3: 特征可视化")
analyzer.visualize_features(
    feature_type='latent',
    method='tsne',
    save_path='latent_tsne.png'
)
analyzer.visualize_features(
    feature_type='unified_emb',
    method='tsne',
    save_path='unified_emb_tsne.png'
)

print("\\n实验结果解读:")
print(f"Latent分类准确率: {acc_latent:.2f}%")
print(f"Unified EMB分类准确率: {acc_unified:.2f}%")
print(f"随机猜测准确率: {100./len(audio_files):.2f}%")

if acc_latent > 100./len(audio_files) * 2:
    print("\\n✓ 结论: Latent特征包含显著的说话人信息！")

if acc_unified > 100./len(audio_files) * 2:
    print("✓ 结论: Unified EMB特征也包含显著的说话人信息！")

2. 推荐的数据集：
   - LibriSpeech (英文)
   - VCTK (英文, 110个说话人)
   - AIShell (中文)
   - Common Voice (多语言)
"""

    # Section header: two blank lines, then the title framed by rules.
    print("\n\n" + rule)
    print(title)
    print(rule)

    print(instructions)


def example_3_voice_conversion():
    """
    Example 3: show how to use the voice conversion model.

    This function only prints a three-step walkthrough (create the model,
    train it, run inference) to stdout; it does not build or run a model.
    Following the walkthrough requires training the conversion model first.

    Returns:
        None.
    """
    rule = "=" * 80
    title = "示例3: 语音转换模型使用"

    # Printed verbatim as a tutorial; nothing in this text is executed here.
    walkthrough = """
使用语音转换模型的步骤：

1. 创建模型:

from audio_tokenizer.modeling_audio_vae import AudioVAE
from voice_conversion_model import VoiceConversionModel, VoiceConversionLoss

# 加载MingTok-Audio
mingtok_model = AudioVAE.from_pretrained('inclusionAI/MingTok-Audio')
mingtok_model = mingtok_model.cuda()

# 创建语音转换模型
vc_model = VoiceConversionModel(
    mingtok_model,
    freeze_tokenizer=True
).cuda()

2. 训练模型:

import torch.optim as optim

# 准备数据加载器（需要源音频、目标音频、目标mel-spectrogram）
# train_loader = ...

loss_fn = VoiceConversionLoss()
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, vc_model.parameters()),
    lr=1e-4
)

for epoch in range(num_epochs):
    for batch in train_loader:
        src_waveform = batch['source_audio'].cuda()
        tgt_mel_spec = batch['target_mel'].cuda()
        tgt_waveform = batch['target_audio'].cuda()

        # 前向传播
        converted_waveform, intermediates = vc_model(
            src_waveform,
            tgt_mel_spec,
            return_intermediate=True
        )

        # 计算说话人embedding
        converted_mel = extract_mel(converted_waveform)
        converted_speaker_emb = vc_model.speaker_encoder(converted_mel)
        target_speaker_emb = vc_model.speaker_encoder(tgt_mel_spec)

        # 计算损失
        losses = loss_fn(
            converted_waveform=converted_waveform,
            target_waveform=tgt_waveform,
            converted_speaker_emb=converted_speaker_emb,
            target_speaker_emb=target_speaker_emb,
            src_latent=intermediates['src_latent'],
            converted_latent=intermediates['converted_latent']
        )

        # 反向传播
        optimizer.zero_grad()
        losses['total_loss'].backward()
        optimizer.step()

3. 推理（语音转换）:

# 加载训练好的模型
vc_model.load_state_dict(torch.load('vc_model.pt'))
vc_model.eval()

# 加载音频
src_waveform, _ = torchaudio.load('source_audio.wav')
tgt_waveform, _ = torchaudio.load('target_speaker_reference.wav')

# 提取目标说话人的mel-spectrogram
tgt_mel = extract_mel_spectrogram(tgt_waveform)

# 执行转换
with torch.no_grad():
    converted = vc_model.convert(
        src_waveform.cuda(),
        tgt_mel.cuda()
    )

# 保存结果
torchaudio.save('converted_output.wav', converted.cpu(), sample_rate=16000)
"""

    # Section header: two blank lines, then the title framed by rules.
    print("\n\n" + rule)
    print(title)
    print(rule)

    print(walkthrough)


def main():
    """
    Entry point: print the banner, run the selected examples, print a footer.

    Selection is driven by flags looked up in ``sys.argv``:

    * no arguments, ``--example1`` or ``--all``: run example 1; any exception
      it raises is reported on stdout instead of propagating
    * ``--example2`` or ``--all``: print the example 2 walkthrough
    * ``--example3`` or ``--all``: print the example 3 walkthrough

    Returns:
        None.
    """
    import sys

    banner = """
╔════════════════════════════════════════════════════════════════════════════╗
║                    语音转换系统使用示例                                    ║
╚════════════════════════════════════════════════════════════════════════════╝

本脚本包含3个示例：

1. 提取音频特征 - 演示如何使用MingTok-Audio提取各种特征
2. 验证说话人信息 - 通过实验验证特征是否包含说话人信息
3. 语音转换模型 - 演示如何训练和使用语音转换模型

选项:
  --example1  : 运行示例1（提取特征）
  --example2  : 显示示例2的说明（需要准备多说话人数据）
  --example3  : 显示示例3的说明（需要训练模型）
  --all       : 显示所有示例

默认: 运行示例1
"""
    print(banner)

    argv = sys.argv
    no_arguments = len(argv) == 1

    def selected(flag):
        # A section runs when its own flag or --all appears anywhere in argv.
        return flag in argv or '--all' in argv

    if no_arguments or selected('--example1'):
        # Best effort: example 1 needs the model, a GPU and the sample file,
        # so a failure is reported rather than aborting the script.
        try:
            example_1_extract_features()
        except Exception as e:
            print(f"\n示例1运行出错: {e}")
            print("请确保已安装所有依赖并下载了模型")

    if selected('--example2'):
        example_2_verify_speaker_info()

    if selected('--example3'):
        example_3_voice_conversion()

    # Footer pointing at the companion documents and modules.
    rule = "=" * 80
    footer_lines = (
        "更多信息请查看:",
        "  - voice_conversion_design.md: 理论分析和设计方案",
        "  - VOICE_CONVERSION_README.md: 完整使用指南",
        "  - verify_speaker_info.py: 验证实验工具",
        "  - voice_conversion_model.py: 语音转换模型实现",
    )
    print("\n\n" + rule)
    for line in footer_lines:
        print(line)
    print(rule)


# Run the examples only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == '__main__':
    main()