Zerone-Agent · Zerone-Agent · Dec 8, 2025 · Dec 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -209,5 +209,6 @@ __marimo__/
 # Local
 cache/
 checkpoints/
+dumps/
 examples/
 register_db/
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM megrez:pytorch-2.8.0_cuda-12.8_python-3.12_ubuntu-22.04
+FROM megrez:pytorch-2.6.0_cuda-12.6_python-3.12_ubuntu-22.04
 
 WORKDIR /app
 

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
-# faster-whisper 实时语音转录系统
+# funasr 实时语音转录系统
 
-本项目是一个基于 `faster-whisper` 和 `Silero VAD` 的实时语音转录系统，支持流式音频输入和低延迟转录。系统由后端服务和前端客户端组成，可应用于会议记录、实时字幕等场景。
+本项目是一个基于 `funasr` 的实时语音转录系统，支持流式音频输入和低延迟转录。系统由后端服务和前端客户端组成，可应用于会议记录、实时字幕等场景。
 
 ## 项目结构
 
@@ -33,26 +33,17 @@
 
 > **注意**: 音频长度小于 0.4 秒时，将仅进行简单的峰值归一化以保证处理稳定性。
 
-### 2. 语音活动检测 (VAD)
+### 2. 实时转录
 
-使用 Silero VAD 模型检测语音活动，有效过滤静音段，提升转录效率和准确性。
+基于 funasr 模型实现流式转录，支持以下特性：
 
-**配置参数** (`config.py`):
-- `vad_threshold`: VAD 检测阈值 (默认 0.1)
-- `min_silence_duration`: 最小静音时长 (默认 12 帧 ≈ 375ms)
-- `min_voice_duration`: 最小语音时长 (默认 8 帧 ≈ 250ms)
-- `silence_reserve`: 语音段前后保留的静音采样点 (默认 6 帧 ≈ 187.5ms)
-
-### 3. 实时转录
-
-基于 faster-whisper 模型实现流式转录，支持以下特性：
+- **自动语言检测**: 支持自动识别音频语言类型
+- **逆文本规范化**: 自动将口语形式的数字、日期等转换为规范的书面格式（例如“二零二五年”→“2025年”）
+- **VAD 智能合并**: 通过 `merge_vad` 和 `merge_length_s` 参数合并相邻语音片段，提升长句转录准确性
+- **句子时间戳**: 提供每个句子的起止时间信息，支持精确的音频定位
+- **上下文连续性**: 通过音频缓冲区管理保持流式转录的上下文连续性
 
-- **上下文感知**: 使用上一段落文本作为 prompt 或 hotwords，提升转录连贯性
-- **幻觉抑制**: 通过 `suppress_blank` 和 `repetition_penalty` 参数减少模型幻觉
-- **多温度采样**: 支持 `[0.0, 0.2, 0.6, 1.0]` 温度序列，平衡生成质量和多样性
-- **繁体转简体**: 可选开启繁体中文到简体中文的转换
-
-### 4. 发言人识别
+### 3. 发言人识别
 
 基于 ModelScope 的 ERes2NetV2 模型实现发言人验证，支持多发言人场景的自动识别。
 
@@ -85,17 +76,14 @@ pip install -r requirements.txt
 
 ## 模型准备
 
-下载 `faster-whisper` 、 `ERes2NetV2` 、 `MossFormer2_SE_48K` 和 `silero-vad` 模型到 `checkpoints/` 目录
+下载 `ERes2NetV2` 和 `MossFormer2_SE_48K` 模型到 `checkpoints/` 目录
 
 ```bash
 cd checkpoints
 
-modelscope download --model mobiuslabsgmbh/faster-whisper-large-v3-turbo --local_dir ./faster-whisper-large-v3-turbo
 modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint --local_dir .
 modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint.pt --local_dir .
 modelscope download --model iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common --local_dir ./ERes2NetV2_w24s4ep4
-
-git clone https://github.com/snakers4/silero-vad.git
 ```
 
 ## 运行方式

diff --git a/cache/dump_save_here → cache/modelscope_cache_here b/cache/dump_save_here → cache/modelscope_cache_here
diff --git a/config.py b/config.py
@@ -6,15 +6,15 @@ class Config:
 
     models = {
         "asr": {
-            "name": "faster-whisper",
-            "path": os.path.join(model_path, "faster-whisper-large-v3-turbo"),
-            "compute_type": "float16",
+            "name": "paraformer-zh",
             "device": "cuda"
         },
         "vad": {
-            "name": "silero",
-            "path": os.path.join(model_path, "silero-vad"),
-            "compute_type": "float16",
+            "name": "fsmn-vad",
+            "device": "cuda"
+        },
+        "punc": {
+            "name": "ct-punc-c",
             "device": "cuda"
         },
         "speaker_verifier": {
@@ -28,63 +28,21 @@ class Config:
         }
     }
 
+    samplerate = 16000
     preheat_audio = "./preheat_audio.wav"
+    max_silence_interval = 2    # 最大间隔时长，单位：秒，超过该时长则认为中断
+    max_speech_duration = 20    # 最大音频时长，单位：秒，超过该时长则强制结束说话人验证
 
     dump = {
-        "audio_save": "none",  # all: 保存所有音频，final: 只保存最终音频, none: 不保存
-        "audio_dir": "./cache"
+        "audio_save": "none",   # all: 保存所有音频，final: 只保存最终音频, none: 不保存
+        "audio_dir": "./dumps"
     }
 
     speech_enhance = {
-        "enable": True,
+        "enable": False,
         "model_name": "MossFormer2_SE_48K",
         "target_lufs": -16.0,
         "true_peak_limit": -1.0,
         "mute_if_too_quiet": True,
         "threshold_dbfs": -50,
     }
-
-    vad = {
-        "enable": True,
-        "vad_threshold": 0.2,
-        "sampling_rate": 16000,
-        "sampling_per_chunk": 512,
-        "min_silence_duration": 12,        # 12 * 31.25ms = 375ms
-        "min_voice_duration": 8,           # 8 * 31.25ms = 250ms
-        "silence_reserve": 6,              # 6 * 31.25ms = 187.5ms
-    }
-
-    filter_match = {
-        "enable": True,
-        "find_match": ["谢谢大家", "简体中文", "优独播剧场", "大家好，这是一段会议录音。"],
-        "cos_match": [
-            "请不吝点赞 订阅 转发 打赏支持明镜与点栏目",
-            "志愿者 李宗盛",
-            "大家好，这是一段会议录音。",
-            "字幕志愿者 杨栋梁",
-            "明镜需要您的支持 欢迎订阅明镜",
-            "优优独播剧场——YoYo Television Series Exclusive",
-            "中文字幕——Yo Television Series Exclusive"
-        ],
-        "cos_sim": 0.02
-    }
-
-    whisper_config = {
-        "tradition_to_simple": False,
-        "interruption_duration": 20,    # 最大中断时长，单位：秒
-        "beam_size": 8,  # 1、beam_size调整为8 best_of调整为4 提高模型效果
-        "best_of": 4,    # 2、beam_size调整为4 best_of调整为1 速度更快
-        "patience": 1.0,
-        "suppress_blank": True,     # 幻觉抑制
-        "repetition_penalty": 1.2,  # 重复惩罚 但降低效果
-        "log_prob_threshold": -1.0,
-        "no_speech_threshold": 0.8,
-        "condition_on_previous_text": True,
-        "previous_text_prompt": False,
-        "previous_text_hotwords": True, # 把上段语句做为提示 断句相对更保守 以提升效果
-        "previous_text_prefix": False,
-        "initial_prompt": "大家好，这是一段会议录音。",
-        "hotwords_text": "",
-        "temperature": [0.0, 0.2, 0.6, 1.0],
-        "avg_logprob_score": -1.0  # 设置过滤阈值 低于阈值则不输出
-    }
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,7 +1,6 @@
 services:
   api:
     image: transcriptor:latest
-    runtime: nvidia
     deploy:
       resources:
         reservations:
@@ -12,8 +11,9 @@ services:
     environment:
       - PYTHONUNBUFFERED=1
     volumes:
-      - ./cache:/app/cache
+      - ./cache:/root/.cache/modelscope
       - ./checkpoints:/app/checkpoints
+      - ./dumps:/app/dumps
       - ./examples:/app/examples
       - ./register_db:/app/register_db
       - ./config.py:/app/config.py

diff --git a/dumps/dump_save_here b/dumps/dump_save_here
diff --git a/requirements-server.txt b/requirements-server.txt
@@ -1,15 +1,10 @@
-faster-whisper==1.2.0
-librosa==0.10.2.post1
-OpenCC==1.1.9
-opuslib_next==1.1.5
-scikit-learn==1.7.2
-websockets==14.1
+modelscope==1.32.0
+modelscope[framework]==1.32.0
+funasr==1.2.7
+torch==2.6.0+cu126
+torchaudio==2.6.0+cu126
 pydub==0.25.1
-modelscope==1.31.0
-addict==2.4.0
-datasets==3.6.0
-pillow==12.0.0
-simplejson==3.20.2
-sortedcontainers==2.4.0
 pyloudnorm==0.1.1
 clearvoice==0.1.2
+websockets==14.1
+opuslib_next==1.1.5
diff --git a/requirements.txt b/requirements.txt
@@ -1,19 +1,12 @@
-faster-whisper==1.2.0
-librosa==0.10.2.post1
-OpenCC==1.1.9
-opuslib_next==1.1.5
-scikit-learn==1.7.2
-torch==2.9.0
-torchaudio==2.9.0
-websocket-client==1.9.0
-websockets==14.1
+modelscope==1.32.0
+modelscope[framework]==1.32.0
+funasr==1.2.7
+torch==2.6.0
+torchaudio==2.6.0
 pydub==0.25.1
-pyaudio==0.2.14
-modelscope==1.31.0
-addict==2.4.0
-datasets==3.6.0
-pillow==12.0.0
-simplejson==3.20.2
-sortedcontainers==2.4.0
 pyloudnorm==0.1.1
 clearvoice==0.1.2
+websockets==14.1
+opuslib_next==1.1.5
+websocket-client==1.9.0
+pyaudio==0.2.14