Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -209,5 +209,6 @@ __marimo__/
# Local
cache/
checkpoints/
dumps/
examples/
register_db/
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM megrez:pytorch-2.8.0_cuda-12.8_python-3.12_ubuntu-22.04
FROM megrez:pytorch-2.6.0_cuda-12.6_python-3.12_ubuntu-22.04

WORKDIR /app

Expand Down
34 changes: 11 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# faster-whisper 实时语音转录系统
# funasr 实时语音转录系统

本项目是一个基于 `faster-whisper` 和 `Silero VAD` 的实时语音转录系统,支持流式音频输入和低延迟转录。系统由后端服务和前端客户端组成,可应用于会议记录、实时字幕等场景。
本项目是一个基于 `funasr` 的实时语音转录系统,支持流式音频输入和低延迟转录。系统由后端服务和前端客户端组成,可应用于会议记录、实时字幕等场景。

## 项目结构

Expand Down Expand Up @@ -33,26 +33,17 @@

> **注意**: 音频长度小于 0.4 秒时,将仅进行简单的峰值归一化以保证处理稳定性。

### 2. 语音活动检测 (VAD)
### 2. 实时转录

使用 Silero VAD 模型检测语音活动,有效过滤静音段,提升转录效率和准确性。
基于 funasr 提供的模型(如 `paraformer-zh`)实现流式转录,支持以下特性:

**配置参数** (`config.py`):
- `vad_threshold`: VAD 检测阈值 (默认 0.1)
- `min_silence_duration`: 最小静音时长 (默认 12 帧 ≈ 375ms)
- `min_voice_duration`: 最小语音时长 (默认 8 帧 ≈ 250ms)
- `silence_reserve`: 语音段前后保留的静音采样点 (默认 6 帧 ≈ 187.5ms)

### 3. 实时转录

基于 faster-whisper 模型实现流式转录,支持以下特性:
- **自动语言检测**: 支持自动识别音频语言类型
- **逆文本规范化**: 自动将数字、日期等转换为标准文本格式
- **VAD 智能合并**: 通过 `merge_vad` 和 `merge_length_s` 参数合并相邻语音片段,提升长句转录准确性
- **句子时间戳**: 提供每个句子的起止时间信息,支持精确的音频定位
- **上下文连续性**: 通过音频缓冲区管理保持流式转录的上下文连续性

- **上下文感知**: 使用上一段落文本作为 prompt 或 hotwords,提升转录连贯性
- **幻觉抑制**: 通过 `suppress_blank` 和 `repetition_penalty` 参数减少模型幻觉
- **多温度采样**: 支持 `[0.0, 0.2, 0.6, 1.0]` 温度序列,平衡生成质量和多样性
- **繁体转简体**: 可选开启繁体中文到简体中文的转换

### 4. 发言人识别
### 3. 发言人识别

基于 ModelScope 的 ERes2NetV2 模型实现发言人验证,支持多发言人场景的自动识别。

Expand Down Expand Up @@ -85,17 +76,14 @@ pip install -r requirements.txt

## 模型准备

下载 `faster-whisper` 、 `ERes2NetV2` 、 `MossFormer2_SE_48K` 和 `silero-vad` 模型到 `checkpoints/` 目录
下载 `ERes2NetV2` 和 `MossFormer2_SE_48K` 模型到 `checkpoints/` 目录

```bash
cd checkpoints

modelscope download --model mobiuslabsgmbh/faster-whisper-large-v3-turbo --local_dir ./faster-whisper-large-v3-turbo
modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint --local_dir .
modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint.pt --local_dir .
modelscope download --model iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common --local_dir ./ERes2NetV2_w24s4ep4

git clone https://github.com/snakers4/silero-vad.git
```

## 运行方式
Expand Down
File renamed without changes.
66 changes: 12 additions & 54 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ class Config:

models = {
"asr": {
"name": "faster-whisper",
"path": os.path.join(model_path, "faster-whisper-large-v3-turbo"),
"compute_type": "float16",
"name": "paraformer-zh",
"device": "cuda"
},
"vad": {
"name": "silero",
"path": os.path.join(model_path, "silero-vad"),
"compute_type": "float16",
"name": "fsmn-vad",
"device": "cuda"
},
"punc": {
"name": "ct-punc-c",
"device": "cuda"
},
"speaker_verifier": {
Expand All @@ -28,63 +28,21 @@ class Config:
}
}

samplerate = 16000
preheat_audio = "./preheat_audio.wav"
max_silence_interval = 2 # 最大间隔时长,单位:秒,超过该时长则认为中断
max_speech_duration = 20 # 最大音频时长,单位:秒,超过该时长则强制结束说话人验证

dump = {
"audio_save": "none", # all: 保存所有音频,final: 只保存最终音频, none: 不保存
"audio_dir": "./cache"
"audio_save": "none", # all: 保存所有音频,final: 只保存最终音频, none: 不保存
"audio_dir": "./dumps"
}

speech_enhance = {
"enable": True,
"enable": False,
"model_name": "MossFormer2_SE_48K",
"target_lufs": -16.0,
"true_peak_limit": -1.0,
"mute_if_too_quiet": True,
"threshold_dbfs": -50,
}

vad = {
"enable": True,
"vad_threshold": 0.2,
"sampling_rate": 16000,
"sampling_per_chunk": 512,
"min_silence_duration": 12, # 12 * 31.25ms = 375ms
"min_voice_duration": 8, # 8 * 31.25ms = 250ms
"silence_reserve": 6, # 6 * 31.25ms = 187.5ms
}

filter_match = {
"enable": True,
"find_match": ["谢谢大家", "简体中文", "优独播剧场", "大家好,这是一段会议录音。"],
"cos_match": [
"请不吝点赞 订阅 转发 打赏支持明镜与点栏目",
"志愿者 李宗盛",
"大家好,这是一段会议录音。",
"字幕志愿者 杨栋梁",
"明镜需要您的支持 欢迎订阅明镜",
"优优独播剧场——YoYo Television Series Exclusive",
"中文字幕——Yo Television Series Exclusive"
],
"cos_sim": 0.02
}

whisper_config = {
"tradition_to_simple": False,
"interruption_duration": 20, # 最大中断时长,单位:秒
"beam_size": 8, # 1、beam_size调整为8 best_of调整为4 提高模型效果
"best_of": 4, # 2、beam_size调整为4 best_of调整为1 速度更快
"patience": 1.0,
"suppress_blank": True, # 幻觉抑制
"repetition_penalty": 1.2, # 重复惩罚 但降低效果
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.8,
"condition_on_previous_text": True,
"previous_text_prompt": False,
"previous_text_hotwords": True, # 把上段语句做为提示 断句相对更保守 以提升效果
"previous_text_prefix": False,
"initial_prompt": "大家好,这是一段会议录音。",
"hotwords_text": "",
"temperature": [0.0, 0.2, 0.6, 1.0],
"avg_logprob_score": -1.0 # 设置过滤阈值 低于阈值则不输出
}
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
services:
api:
image: transcriptor:latest
runtime: nvidia
deploy:
resources:
reservations:
Expand All @@ -12,8 +11,9 @@ services:
environment:
- PYTHONUNBUFFERED=1
volumes:
- ./cache:/app/cache
- ./cache:/root/.cache/modelscope
- ./checkpoints:/app/checkpoints
- ./dumps:/app/dumps
- ./examples:/app/examples
- ./register_db:/app/register_db
- ./config.py:/app/config.py
Expand Down
Empty file added dumps/dump_save_here
Empty file.
19 changes: 7 additions & 12 deletions requirements-server.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
faster-whisper==1.2.0
librosa==0.10.2.post1
OpenCC==1.1.9
opuslib_next==1.1.5
scikit-learn==1.7.2
websockets==14.1
modelscope==1.32.0
modelscope[framework]==1.32.0
funasr==1.2.7
torch==2.6.0+cu126
torchaudio==2.6.0+cu126
pydub==0.25.1
modelscope==1.31.0
addict==2.4.0
datasets==3.6.0
pillow==12.0.0
simplejson==3.20.2
sortedcontainers==2.4.0
pyloudnorm==0.1.1
clearvoice==0.1.2
websockets==14.1
opuslib_next==1.1.5
25 changes: 9 additions & 16 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
faster-whisper==1.2.0
librosa==0.10.2.post1
OpenCC==1.1.9
opuslib_next==1.1.5
scikit-learn==1.7.2
torch==2.9.0
torchaudio==2.9.0
websocket-client==1.9.0
websockets==14.1
modelscope==1.32.0
modelscope[framework]==1.32.0
funasr==1.2.7
torch==2.6.0
torchaudio==2.6.0
pydub==0.25.1
pyaudio==0.2.14
modelscope==1.31.0
addict==2.4.0
datasets==3.6.0
pillow==12.0.0
simplejson==3.20.2
sortedcontainers==2.4.0
pyloudnorm==0.1.1
clearvoice==0.1.2
websockets==14.1
opuslib_next==1.1.5
websocket-client==1.9.0
pyaudio==0.2.14
Loading