From d1d91a36c03734ebedc35e8f4a90e70058d8dfbf Mon Sep 17 00:00:00 2001
From: yxz <3978401510@qq.com@example.com>
Date: Thu, 9 Apr 2026 22:12:03 +0800
Subject: [PATCH] =?UTF-8?q?feat(video):=20=E5=8F=A3=E6=92=AD=20ASR?=
 =?UTF-8?q?=E3=80=81=E6=AD=A3=E6=96=87=E5=90=88=E5=B9=B6=E4=B8=8E=E5=BF=AB?=
 =?UTF-8?q?=E8=AF=86=E8=B6=85=E6=97=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 video_stt：硅基/OpenAI 兼容转写，httpx 直连与超时，响应解析与日志
- 视频快识：剔除旁白行后再合并 STT；标题正文同短句时仍抽帧 OCR；专向听写不足判定与单测
- 前端 quickRecognizeVideo 超时延长至 10 分钟以覆盖长视频 STT
- .env.example 补充 TeleSpeechASR 与转写接口说明

Made-with: Cursor
---
 .env.example                           |  28 ++
 backend/app/analysis/video_stt.py      | 401 +++++++++++++++++++++++++
 backend/app/api/screenshot_api.py      | 124 +++++++-
 backend/tests/test_video_text_merge.py |  32 ++
 frontend/src/utils/api.ts              |   6 +-
 5 files changed, 584 insertions(+), 7 deletions(-)
 create mode 100644 backend/app/analysis/video_stt.py
 create mode 100644 backend/tests/test_video_text_merge.py

diff --git a/.env.example b/.env.example
index 1def101..41c2fc9 100644
--- a/.env.example
+++ b/.env.example
@@ -90,3 +90,31 @@ TEMP_VIDEO_TTL_SECONDS=900
 
 # 可选：临时视频落盘目录
 # TEMP_VIDEO_DIR=backend/data/temp_videos
+
+# === 视频口播转写（OpenAI 兼容 /audio/transcriptions，非 TTS）===
+# 需本机安装 ffmpeg。只要填了 OPENAI_WHISPER_BASE_URL，就必须填 OPENAI_WHISPER_API_KEY（勿用 MiMo Key）。
+# 默认开启（代码默认值为 1）；可显式关闭：
+# VIDEO_STT_ENABLED=0
+# VIDEO_STT_ENABLED=1
+#
+# OpenAI 官方 Whisper：
+# OPENAI_WHISPER_BASE_URL=https://api.openai.com/v1
+# OPENAI_WHISPER_API_KEY=sk-...
+# WHISPER_MODEL=whisper-1
+#
+# 硅基流动 ASR（创建语音转文本，勿与 TTS「上传参考音频」upload-voice 混淆）：
+# 文档：https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
+# upload-voice 文档：https://docs.siliconflow.cn/cn/api-reference/audio/upload-voice （仅 TTS 音色，不用于 ASR）
+# OPENAI_WHISPER_BASE_URL=https://api.siliconflow.cn/v1
+# OPENAI_WHISPER_API_KEY=<硅基控制台 API Key>
+# WHISPER_MODEL=TeleAI/TeleSpeechASR
+# WHISPER_MODEL=FunAudioLLM/SenseVoiceSmall
+# 硅基部分模型可不传 language：VIDEO_STT_LANGUAGE= （空）
+# 转写音频总时长上限（秒）：未设置时代码默认 3600，取值会被限制在 60–14400 之间
+# VIDEO_STT_MAX_AUDIO_SECONDS=600
+# 长视频分段转写（秒），默认 480；调小更稳但请求次数更多
+# VIDEO_STT_SEGMENT_SECONDS=480
+# VIDEO_STT_TIMEOUT_SEC=240
+# VIDEO_STT_LANGUAGE=zh
+# 优先请求 verbose_json（带 segments）；网关不支持会自动回退
+# VIDEO_STT_PREFER_VERBOSE_JSON=1
diff --git a/backend/app/analysis/video_stt.py b/backend/app/analysis/video_stt.py
new file mode 100644
index 0000000..8c5d39e
--- /dev/null
+++ b/backend/app/analysis/video_stt.py
@@ -0,0 +1,401 @@
+"""
+视频口播转写（Speech-to-Text）：从音轨提取 WAV 后调用 **OpenAI 兼容** 的 `/v1/audio/transcriptions`。
+
+说明：用户口语中的「TTS」常混用；此处为 **ASR（语音→文字）**，不是文字转语音。
+
+兼容：
+- OpenAI 官方：`OPENAI_WHISPER_BASE_URL=https://api.openai.com/v1`，`WHISPER_MODEL=whisper-1`
+- 硅基流动 ASR：`OPENAI_WHISPER_BASE_URL=https://api.siliconflow.cn/v1`，`WHISPER_MODEL` 如 `TeleAI/TeleSpeechASR`、`FunAudioLLM/SenseVoiceSmall`（走「创建语音转文本」/audio/transcriptions，非 TTS 的 upload-voice）
+
+依赖：
+- 系统 PATH 中可用 `ffmpeg`（提取音轨）
+- 配置 `OPENAI_WHISPER_API_KEY` + `OPENAI_WHISPER_BASE_URL`（勿用 MiMo 对话 Key 冒充第三方 ASR）
+"""
+from __future__ import annotations
+
+import asyncio
+import io
+import logging
+import os
+import subprocess
+import tempfile
+from typing import Optional
+
+logger = logging.getLogger("noterx.video_stt")
+
+
+def _stt_enabled() -> bool:
+    """Report whether STT is on; VIDEO_STT_ENABLED defaults to "1" and accepts 1/true/yes/on."""
+    flag = os.getenv("VIDEO_STT_ENABLED", "1")
+    return flag.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _resolve_whisper_client_config() -> tuple[Optional[str], Optional[str]]:
+    """
+    Pick the API key and base URL for the transcription endpoint from the environment.
+
+    @returns (api_key, base_url); (None, None) when no usable configuration exists
+    """
+    dedicated_key = (os.getenv("OPENAI_WHISPER_API_KEY") or "").strip()
+    dedicated_base = (os.getenv("OPENAI_WHISPER_BASE_URL") or "").strip().rstrip("/")
+
+    if dedicated_base:
+        # An explicit ASR base URL needs its own key; OPENAI_API_KEY is never used for it.
+        if dedicated_key:
+            return dedicated_key, dedicated_base
+        logger.info(
+            "VIDEO_STT: 已设置 OPENAI_WHISPER_BASE_URL，请在 .env 填写 OPENAI_WHISPER_API_KEY",
+        )
+        return None, None
+
+    api_key = dedicated_key or (os.getenv("OPENAI_API_KEY") or "").strip()
+    if not api_key:
+        return None, None
+
+    raw_base = os.getenv("OPENAI_BASE_URL") or ""
+    if any(marker in raw_base.strip().lower() for marker in ("xiaomimimo", "mimo-v2.com")):
+        logger.info(
+            "VIDEO_STT: 已配置 MiMo 为 OPENAI_BASE_URL；请设置 OPENAI_WHISPER_BASE_URL 与 OPENAI_WHISPER_API_KEY，"
+            "例如官方 https://api.openai.com/v1 或硅基流动 https://api.siliconflow.cn/v1",
+        )
+        return None, None
+    return api_key, (raw_base or "https://api.openai.com/v1").strip().rstrip("/")
+
+
+def _probe_video_duration_seconds(video_path: str) -> Optional[float]:
+    """
+    Ask ffprobe for the media duration.
+
+    @param video_path - path of the media file on disk
+    @returns duration in seconds; None when ffprobe cannot be run, exits non-zero,
+        prints nothing parseable, or reports a value <= 0
+    """
+    probe_cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
+        video_path,
+    ]
+    try:
+        done = subprocess.run(probe_cmd, capture_output=True, timeout=20, check=False)
+        if done.returncode != 0:
+            return None
+        printed = (done.stdout or b"").decode("utf-8", errors="replace").strip()
+        # Empty or non-numeric output makes float() raise, which is mapped to None below.
+        seconds = float(printed)
+        return None if seconds <= 0 else seconds
+    except Exception:
+        # Best effort: a missing ffprobe binary or a timeout also yields None.
+        return None
+
+
+def _extract_wav_segment(video_path: str, *, start_sec: float, clip_sec: float) -> bytes:
+    """
+    Extract one audio clip from the video as 16 kHz mono 16-bit PCM WAV via ffmpeg.
+    @returns the WAV file bytes; empty bytes mean the extraction failed
+    """
+    fd, wav_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)  # only the path is needed; ffmpeg writes the file itself (-y overwrites)
+    try:
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-ss",
+            f"{max(start_sec, 0):.3f}",  # negative start offsets are clamped to 0
+            "-i",
+            video_path,
+            "-vn",
+            "-acodec",
+            "pcm_s16le",
+            "-ar",
+            "16000",
+            "-ac",
+            "1",
+            "-t",
+            f"{max(clip_sec, 1.0):.3f}",  # always request at least one second
+            wav_path,
+        ]
+        proc = subprocess.run(
+            cmd,
+            capture_output=True,
+            timeout=max(120, int(clip_sec * 1.5)),  # at least 120 s, scaled with clip length
+            check=False,
+        )
+        if proc.returncode != 0:
+            err = (proc.stderr or b"").decode("utf-8", errors="replace")[:400]
+            logger.warning("VIDEO_STT: ffmpeg 片段提取失败 rc=%s %s", proc.returncode, err)
+            return b""
+        with open(wav_path, "rb") as wf:
+            out = wf.read()
+        return out
+    except FileNotFoundError:
+        logger.warning("VIDEO_STT: 未找到 ffmpeg，请安装后加入 PATH")
+        return b""
+    except subprocess.TimeoutExpired:
+        logger.warning("VIDEO_STT: ffmpeg 片段提取超时")
+        return b""
+    except Exception as e:
+        logger.warning("VIDEO_STT: 片段提取异常 %s", e)
+        return b""
+    finally:
+        if wav_path and os.path.exists(wav_path):
+            try:
+                os.remove(wav_path)
+            except OSError:
+                pass
+
+
+def _extract_wav_chunks_from_video_bytes(video_bytes: bytes, container_suffix: str) -> list[bytes]:
+    """
+    Cut the video's audio track into 16 kHz mono WAV chunks with ffmpeg.
+    VIDEO_STT_MAX_AUDIO_SECONDS (default 3600, clamped to 60..14400) caps the total audio and
+    VIDEO_STT_SEGMENT_SECONDS (default 480, clamped to 30..1200) sets the chunk length; a
+    non-integer value falls back to the default instead of raising out of this function.
+    @returns WAV chunks in playback order; an empty list means extraction failed
+    """
+    suffix = container_suffix if container_suffix.startswith(".") else f".{container_suffix}"
+    try:
+        max_total_sec = int(os.getenv("VIDEO_STT_MAX_AUDIO_SECONDS", "3600"))
+    except ValueError:
+        max_total_sec = 3600
+    try:
+        seg_sec = int(os.getenv("VIDEO_STT_SEGMENT_SECONDS", "480"))
+    except ValueError:
+        seg_sec = 480
+    max_total_sec = max(60, min(max_total_sec, 14400))
+    seg_sec = max(30, min(seg_sec, 1200))
+
+    video_path = ""
+    chunks: list[bytes] = []
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as vf:
+            vf.write(video_bytes)
+            video_path = vf.name
+
+        duration = _probe_video_duration_seconds(video_path)
+        if duration is None:
+            # Duration unknown: assume the cap so at least one extraction is attempted.
+            duration = float(max_total_sec)
+        target_total = min(duration, float(max_total_sec))
+        if duration > max_total_sec:
+            logger.info("VIDEO_STT: 视频时长 %.1fs 超过上限 %ss，超出部分不转写", duration, max_total_sec)
+
+        seg_count = max(1, int((target_total + seg_sec - 1) // seg_sec))
+        for idx in range(seg_count):
+            start_sec = idx * seg_sec
+            remain = target_total - start_sec
+            if remain <= 0:
+                break
+            wav = _extract_wav_segment(video_path, start_sec=start_sec, clip_sec=min(seg_sec, remain))
+            if not wav:
+                continue
+            if len(wav) > 24 * 1024 * 1024:
+                logger.warning("VIDEO_STT: 片段 %s WAV 超过 24MB，建议调小 VIDEO_STT_SEGMENT_SECONDS", idx + 1)
+            chunks.append(wav)
+        return chunks
+    except Exception as e:
+        logger.warning("VIDEO_STT: 提取音轨异常 %s", e)
+        return []
+    finally:
+        if video_path and os.path.exists(video_path):
+            try:
+                os.remove(video_path)
+            except OSError:
+                pass
+
+
+def _join_transcript_parts(parts: list[str]) -> str:
+    """
+    Join chunk transcripts with newlines, collapsing adjacent containment duplicates.
+
+    A chunk contained in the previously kept one is dropped; a chunk that contains the
+    previously kept one replaces it. Blank or None chunks are ignored.
+    @returns the merged transcript ("" when nothing is left)
+    """
+    kept: list[str] = []
+    for chunk in ((raw or "").strip() for raw in parts):
+        if not chunk or (kept and chunk in kept[-1]):
+            continue
+        if kept and kept[-1] in chunk:
+            # The newer chunk is a superset of the last kept one.
+            kept[-1] = chunk
+        else:
+            kept.append(chunk)
+    return "\n".join(kept).strip()
+
+
+async def _transcribe_single_wav(
+    client,
+    *,
+    model: str,
+    wav: bytes,
+    timeout_sec: float,
+    language: Optional[str],
+) -> str:
+    """
+    Transcribe one WAV chunk; try verbose_json first and fall back to the default format on error or empty text.
+    """
+    base_kwargs: dict = {
+        "model": model,
+        "timeout": timeout_sec,
+    }
+    if language:  # omit the parameter entirely when no language is configured
+        base_kwargs["language"] = language
+
+    prefer_verbose = os.getenv("VIDEO_STT_PREFER_VERBOSE_JSON", "1").strip().lower() in (
+        "1",
+        "true",
+        "yes",
+        "on",
+    )
+
+    if prefer_verbose:
+        try:
+            buf = io.BytesIO(wav)
+            buf.name = "audio.wav"  # presumably lets the SDK name the uploaded file — TODO confirm
+            resp = await client.audio.transcriptions.create(
+                **base_kwargs,
+                file=buf,
+                response_format="verbose_json",
+            )
+            text = _transcription_text_from_response(resp)
+            if text:
+                return text
+        except Exception as e:
+            logger.info("VIDEO_STT: verbose_json 不可用，回退默认格式: %s", e)
+
+    buf = io.BytesIO(wav)  # fresh buffer for the fallback request
+    buf.name = "audio.wav"
+    resp = await client.audio.transcriptions.create(**base_kwargs, file=buf)
+    return _transcription_text_from_response(resp)
+
+
+async def transcribe_video_with_whisper(video_bytes: bytes, container_suffix: str) -> str:
+    """
+    Extract WAV chunks in a worker thread, then transcribe them one by one via AsyncOpenAI.
+
+    VIDEO_STT_TIMEOUT_SEC (default 240, clamped to 60..600); non-numeric values fall back to 240.
+    @returns merged transcript; "" when STT is off, unconfigured, has no audio, or the API calls fail
+    """
+    if not _stt_enabled():
+        return ""
+
+    key, base = _resolve_whisper_client_config()
+    if not key or not base:
+        return ""
+
+    model = (os.getenv("WHISPER_MODEL") or "whisper-1").strip()
+
+    chunks = await asyncio.to_thread(_extract_wav_chunks_from_video_bytes, video_bytes, container_suffix)
+    if not chunks:
+        logger.warning("VIDEO_STT: WAV 片段为空，未调用转写 API（检查 ffmpeg、视频是否含音轨、上方 stderr 日志）")
+        return ""
+
+    import httpx
+    from openai import AsyncOpenAI
+
+    try:
+        _stt_http_timeout = float(os.getenv("VIDEO_STT_TIMEOUT_SEC", "240"))
+    except ValueError:
+        _stt_http_timeout = 240.0
+    _stt_http_timeout = max(60.0, min(_stt_http_timeout, 600.0))
+    http_client = httpx.AsyncClient(
+        proxy=None,
+        trust_env=False,
+        timeout=httpx.Timeout(_stt_http_timeout, connect=60.0),
+    )
+    try:
+        client = AsyncOpenAI(api_key=key, base_url=base, http_client=http_client)
+        # Unset -> "zh"; explicitly empty -> omit `language` (some SiliconFlow models may reject it).
+        _lr = os.getenv("VIDEO_STT_LANGUAGE")
+        lang = "zh" if _lr is None else (_lr.strip() or None)
+        texts: list[str] = []
+        total = len(chunks)
+        for idx, wav in enumerate(chunks, start=1):
+            try:
+                text = await _transcribe_single_wav(
+                    client,
+                    model=model,
+                    wav=wav,
+                    timeout_sec=_stt_http_timeout,
+                    language=lang,
+                )
+            except Exception as e:
+                logger.warning("VIDEO_STT: 片段转写失败 chunk=%s/%s err=%s", idx, total, e)
+                text = ""
+            if text:
+                texts.append(text)
+                logger.info("VIDEO_STT: 片段转写成功 chunk=%s/%s len=%s", idx, total, len(text))
+            else:
+                logger.warning("VIDEO_STT: 片段转写为空 chunk=%s/%s", idx, total)
+
+        merged = _join_transcript_parts(texts)
+        if merged:
+            logger.info("VIDEO_STT: 全片转写成功 chunks=%s total_len=%s model=%s", total, len(merged), model)
+        else:
+            logger.warning("VIDEO_STT: 全片转写为空，请核对 ASR 模型与音频内容")
+        return merged
+    except Exception as e:
+        logger.warning("VIDEO_STT: 转写 API 失败 %s", e)
+        return ""
+    finally:
+        await http_client.aclose()
+
+
+def _transcription_text_from_response(resp: object) -> str:
+    """
+    Pull the plain transcript out of an OpenAI SDK / SiliconFlow style response.
+
+    Sources are tried in order: attributes on `resp`, the dict from `resp.model_dump()`
+    (when callable), then `resp` itself when it is a dict. Within each source, dict
+    segments carrying non-blank `text` are joined with newlines and win over `text`.
+    @param resp - Transcription object or a compatible structure
+    @returns stripped transcript text, or "" when nothing usable is found
+    """
+
+    def _segment_lines(segments: object) -> str:
+        # Only list-of-dict segments are understood; anything else yields "".
+        if not isinstance(segments, list):
+            return ""
+        pieces = (
+            str(seg.get("text", "")).strip()
+            for seg in segments
+            if isinstance(seg, dict)
+        )
+        return "\n".join(piece for piece in pieces if piece).strip()
+
+    def _plain_text(value: object) -> str:
+        # Non-string or blank values count as missing.
+        return value.strip() if isinstance(value, str) else ""
+
+    def _from_mapping(data: dict) -> str:
+        return _segment_lines(data.get("segments")) or _plain_text(data.get("text"))
+
+    if resp is None:
+        return ""
+
+    found = _segment_lines(getattr(resp, "segments", None)) or _plain_text(
+        getattr(resp, "text", None)
+    )
+    if found:
+        return found
+
+    # Objects exposing a callable model_dump() are read through the dict it returns.
+    dump = getattr(resp, "model_dump", None)
+    if callable(dump):
+        try:
+            dumped = dump()
+            found = _from_mapping(dumped) if isinstance(dumped, dict) else ""
+        except Exception:
+            # Best effort: a failing dump simply falls through to the dict check.
+            found = ""
+        if found:
+            return found
+
+    if isinstance(resp, dict):
+        return _from_mapping(resp)
+    return ""
diff --git a/backend/app/api/screenshot_api.py b/backend/app/api/screenshot_api.py
index 87e908f..60d3ba6 100644
--- a/backend/app/api/screenshot_api.py
+++ b/backend/app/api/screenshot_api.py
@@ -18,6 +18,7 @@
 
 from app.agents.base_agent import _get_client, _is_mimo_openai_compat, _parse_json_from_llm_text
 from app.analysis.mimo_video import build_mimo_video_url_content_part
+from app.analysis.video_stt import transcribe_video_with_whisper
 from app.api.diagnose import (
     MAX_VIDEO_SIZE,
     MIME_TO_EXT,
@@ -412,11 +413,34 @@ def _content_text_looks_like_video_scene_caption(text: str) -> bool:
     return False
 
 
+def _strip_video_scene_caption_lines(text: str) -> str:
+    """
+    Remove scene-description narration lines, keeping verbatim subtitle/speech lines.
+
+    Each line is stripped and blank lines are dropped before filtering.
+    @param text - raw content_text (None is treated as empty)
+    @returns the surviving lines joined by newlines; "" when none survive
+    """
+    stripped_lines = (raw.strip() for raw in str(text or "").strip().splitlines())
+    survivors = [
+        line
+        for line in stripped_lines
+        if line and not _content_text_looks_like_video_scene_caption(line)
+    ]
+    # Joining an empty list already yields "", so no separate empty-input branch is needed.
+    return "\n".join(survivors).strip()
+
+
 def _sanitize_video_meta_narrative_content(result: dict) -> None:
-    """若正文被填成画面叙述而非字幕原文，清空 content_text，便于触发抽帧/OCR。"""
+    """
+    Remove scene-narration lines from content_text, keeping only verbatim subtitle/speech lines.
+    The previous logic cleared the whole field, which could wipe out STT text already merged in.
+    """
     ct = str(result.get("content_text", "")).strip()
-    if ct and _content_text_looks_like_video_scene_caption(ct):
-        result["content_text"] = ""
+    if not ct:
+        return
+    cleaned = _strip_video_scene_caption_lines(ct)
+    result["content_text"] = cleaned
 
 
 def _normalize_quick_recognition_fields(
@@ -466,7 +490,7 @@ def _video_subtitle_payload_insufficient(result: dict) -> bool:
     视频快识：无可用字幕正文（含模型把画面说明误填进 content_text）时需抽帧或 OCR。
     """
     ct = str(result.get("content_text", "")).strip()
-    if ct and _content_text_looks_like_video_scene_caption(ct):
+    if ct and not _strip_video_scene_caption_lines(ct):
         return True
     return _quick_payload_is_empty(result)
 
@@ -530,6 +554,23 @@ def _merge_subtitle_transcript_into_result(result: dict, lines: list[str]) -> No
         )
 
 
+def _merge_stt_into_video_result(result: dict, stt: str) -> None:
+    """Fold the speech transcript into result["content_text"], complementing on-screen subtitles."""
+    transcript = (stt or "").strip()
+    if not transcript:
+        return
+    existing = _strip_video_scene_caption_lines(str(result.get("content_text", "")).strip())
+    if not existing:
+        merged = transcript
+    elif transcript in existing or existing in transcript:
+        if len(transcript) <= len(existing):
+            return  # the existing body already covers the transcript; leave the field untouched
+        merged = transcript
+    else:
+        merged = f"{existing}\n\n{transcript}".strip()
+    result["content_text"] = merged
+
+
 async def _video_url_quick_call(client, video_url: str) -> dict:
     """
     通过 MiMo 视频理解（video_url content part）请求模型，返回与快识相同结构的 JSON。
@@ -628,11 +669,40 @@ async def _video_url_subtitle_transcript_call(client, video_url: str) -> list[st
     return _parse_subtitle_lines_payload(parsed)
 
 
+def _video_title_body_same_short_hook(result: dict) -> bool:
+    """Tell whether title and body are the same non-empty text of at most 40 characters."""
+    title = str(result.get("title", "")).strip()
+    body = str(result.get("content_text", "")).strip()
+    if not title or title != body:
+        return False
+    return len(body) <= 40
+
+
+def _ocr_supplement_already_sufficient(title_text: str, content_text: str) -> bool:
+    """
+    Decide whether the quick-recognition result is complete enough to skip first-frame OCR.
+
+    Video models often put the same short overlay phrase into both title and body; if
+    "both non-empty" counted as sufficient, OCR could never extend such a body and the UI
+    would keep showing only a few characters whenever speech transcription also fails.
+    @returns True when OCR supplementation can be skipped
+    """
+    body = (content_text or "").strip()
+    title = (title_text or "").strip()
+    if not (body and title):
+        return False
+    identical = title == body
+    if identical and len(body) <= 40:
+        return False
+    # A long body always suffices; a body distinct from the title suffices from 32 chars on.
+    return len(body) >= 52 or (not identical and len(body) >= 32)
+
+
 async def _ocr_supplement_quick_result(client, image_bytes: bytes, result: dict, ocr_cap: int) -> None:
-    """title/content 缺省时用 OCR 补全（与图片快识一致）。"""
+    """title/content 缺省或过短时用 OCR 补全（与图片快识一致）。"""
     content_text = str(result.get("content_text", "")).strip()
     title_text = str(result.get("title", "")).strip()
-    if content_text and title_text:
+    if _ocr_supplement_already_sufficient(title_text, content_text):
         return
     try:
         from app.analysis.ocr_processor import OCRProcessor
@@ -777,6 +847,8 @@ async def quick_recognize_video(request: Request, file: UploadFile = File(...)):
     quick_max_out = _quick_image_max_out_tokens()
     ocr_cap = _quick_ocr_max_tokens()
 
+    stt_task = asyncio.create_task(transcribe_video_with_whisper(video_bytes, container_ext))
+
     result: dict = {}
     video_url_mimo: Optional[str] = None
     url_diag = get_public_base_url_diagnostics(request)
@@ -851,11 +923,51 @@ async def quick_recognize_video(request: Request, file: UploadFile = File(...)):
         not str(result.get("title", "")).strip()
         or not str(result.get("content_text", "")).strip()
         or _content_text_looks_like_video_scene_caption(str(result.get("content_text", "")).strip())
+        or _video_title_body_same_short_hook(result)
     ):
         frame_jpeg = _extract_first_video_frame(video_bytes, container_ext)
     if frame_jpeg:
         await _ocr_supplement_quick_result(client, frame_jpeg, result, ocr_cap)
 
+    stt_text = ""
+    try:
+        _stt_t = float(os.getenv("VIDEO_STT_TIMEOUT_SEC", "240"))
+    except ValueError:
+        _stt_t = 240.0
+    stt_timeout = max(30.0, min(_stt_t, 600.0))
+    try:
+        stt_text = await asyncio.wait_for(stt_task, timeout=stt_timeout)
+    except asyncio.TimeoutError:
+        logger.warning("VIDEO_STT: Whisper 等待超时（%.0fs）", stt_timeout)
+        stt_task.cancel()
+        try:
+            await stt_task
+        except asyncio.CancelledError:
+            pass
+    except Exception as e:
+        logger.warning("VIDEO_STT: 合并前异常 %s", e)
+
+    _prev_ct_len = len(str(result.get("content_text", "") or ""))
+    _merge_stt_into_video_result(result, stt_text)
+    _after_ct_len = len(str(result.get("content_text", "") or ""))
+    logger.info(
+        "VIDEO_STT: 口播合并 prev_content_len=%s stt_len=%s merged_content_len=%s",
+        _prev_ct_len,
+        len((stt_text or "").strip()),
+        _after_ct_len,
+    )
+    # STT is enabled by default (video_stt._stt_enabled() and .env.example both default
+    # VIDEO_STT_ENABLED to 1), so the fallback here must be "1" as well; with "0" an unset
+    # variable silently suppressed the empty-transcript warning below.
+    _stt_env_on = os.getenv("VIDEO_STT_ENABLED", "1").strip().lower() in (
+        "1", "true", "yes", "on",
+    )
+    if not (stt_text or "").strip() and _stt_env_on:
+        logger.warning(
+            "VIDEO_STT: 口播转写为空，正文仍主要来自视频模型/OCR；"
+            "请看上方 VIDEO_STT 日志（ffmpeg、API、代理已改为 trust_env=False 直连）",
+        )
+
     _sanitize_video_meta_narrative_content(result)
 
     if _quick_payload_is_empty(result):
diff --git a/backend/tests/test_video_text_merge.py b/backend/tests/test_video_text_merge.py
new file mode 100644
index 0000000..c799c2a
--- /dev/null
+++ b/backend/tests/test_video_text_merge.py
@@ -0,0 +1,32 @@
+"""
+视频快识正文清洗/合并逻辑测试。
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from app.api.screenshot_api import (
+    _strip_video_scene_caption_lines,
+    _merge_stt_into_video_result,
+    _video_subtitle_payload_insufficient,
+)
+
+
+def test_strip_scene_caption_lines_keeps_real_transcript():
+    """The narration line is removed while the spoken line is kept."""
+    cleaned = _strip_video_scene_caption_lines("视频帧显示一位女生在厨房做饭\n这就是我家餐桌上出现率最高的一道菜")
+    assert "视频帧显示" not in cleaned
+    assert "这就是我家餐桌上出现率最高的一道菜" in cleaned
+
+
+def test_merge_stt_replaces_scene_caption_only_payload():
+    payload = {"content_text": "视频展示了一位博主并叠加字幕提示不要焯水"}
+    _merge_stt_into_video_result(payload, "切记不要焯水\n这样更脆更香")
+    merged_body = payload["content_text"]
+    assert "视频展示了" not in merged_body and "切记不要焯水" in merged_body
+
+
+def test_video_payload_insufficient_when_only_scene_caption():
+    """A body consisting only of scene narration is reported as insufficient."""
+    assert _video_subtitle_payload_insufficient({"content_text": "视频帧显示一位女士在厨房烹饪蘑菇，并叠加字幕提示不要焯水"}) is True
diff --git a/frontend/src/utils/api.ts b/frontend/src/utils/api.ts
index 44e43c0..d69142b 100644
--- a/frontend/src/utils/api.ts
+++ b/frontend/src/utils/api.ts
@@ -402,7 +402,11 @@ export async function quickRecognizeVideo(file: File): Promise<QuickRecognizeRes
   const { data } = await api.post<QuickRecognizeResult>(
     "/screenshot/quick-recognize-video",
     fd,
-    { headers: { "Content-Type": "multipart/form-data" }, timeout: 180000 }
+    {
+      headers: { "Content-Type": "multipart/form-data" },
+      /** 视频快识包含整段 STT，长视频可能需要数分钟 */
+      timeout: 600_000,
+    }
   );
   return data;
 }