From d1d91a36c03734ebedc35e8f4a90e70058d8dfbf Mon Sep 17 00:00:00 2001 From: yxz <3978401510@qq.com@example.com> Date: Thu, 9 Apr 2026 22:12:03 +0800 Subject: [PATCH] =?UTF-8?q?feat(video):=20=E5=8F=A3=E6=92=AD=20ASR?= =?UTF-8?q?=E3=80=81=E6=AD=A3=E6=96=87=E5=90=88=E5=B9=B6=E4=B8=8E=E5=BF=AB?= =?UTF-8?q?=E8=AF=86=E8=B6=85=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 video_stt:硅基/OpenAI 兼容转写,httpx 直连与超时,响应解析与日志 - 视频快识:剔除旁白行后再合并 STT;标题正文同短句时仍抽帧 OCR;专向听写不足判定与单测 - 前端 quickRecognizeVideo 超时延长至 10 分钟以覆盖长视频 STT - .env.example 补充 TeleSpeechASR 与转写接口说明 Made-with: Cursor --- .env.example | 28 ++ backend/app/analysis/video_stt.py | 401 +++++++++++++++++++++++++ backend/app/api/screenshot_api.py | 124 +++++++- backend/tests/test_video_text_merge.py | 32 ++ frontend/src/utils/api.ts | 6 +- 5 files changed, 584 insertions(+), 7 deletions(-) create mode 100644 backend/app/analysis/video_stt.py create mode 100644 backend/tests/test_video_text_merge.py diff --git a/.env.example b/.env.example index 1def101..41c2fc9 100644 --- a/.env.example +++ b/.env.example @@ -90,3 +90,31 @@ TEMP_VIDEO_TTL_SECONDS=900 # 可选:临时视频落盘目录 # TEMP_VIDEO_DIR=backend/data/temp_videos + +# === 视频口播转写(OpenAI 兼容 /audio/transcriptions,非 TTS)=== +# 需本机安装 ffmpeg。只要填了 OPENAI_WHISPER_BASE_URL,就必须填 OPENAI_WHISPER_API_KEY(勿用 MiMo Key)。 +# 默认开启(代码默认值为 1);可显式关闭: +# VIDEO_STT_ENABLED=0 +# VIDEO_STT_ENABLED=1 +# +# OpenAI 官方 Whisper: +# OPENAI_WHISPER_BASE_URL=https://api.openai.com/v1 +# OPENAI_WHISPER_API_KEY=sk-... +# WHISPER_MODEL=whisper-1 +# +# 硅基流动 ASR(创建语音转文本,勿与 TTS「上传参考音频」upload-voice 混淆): +# 文档:https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions +# upload-voice 文档:https://docs.siliconflow.cn/cn/api-reference/audio/upload-voice (仅 TTS 音色,不用于 ASR) +# OPENAI_WHISPER_BASE_URL=https://api.siliconflow.cn/v1 +# OPENAI_WHISPER_API_KEY=<硅基控制台 API Key> +# WHISPER_MODEL=TeleAI/TeleSpeechASR +# WHISPER_MODEL=FunAudioLLM/SenseVoiceSmall +# 硅基部分模型可不传 language:VIDEO_STT_LANGUAGE= (空) +# +# VIDEO_STT_MAX_AUDIO_SECONDS=600 +# 长视频分段转写(秒),默认 480;调小更稳但请求次数更多 +# VIDEO_STT_SEGMENT_SECONDS=480 +# VIDEO_STT_TIMEOUT_SEC=240 +# VIDEO_STT_LANGUAGE=zh +# 优先请求 verbose_json(带 segments);网关不支持会自动回退 +# VIDEO_STT_PREFER_VERBOSE_JSON=1 diff --git a/backend/app/analysis/video_stt.py b/backend/app/analysis/video_stt.py new file mode 100644 index 0000000..8c5d39e --- /dev/null +++ b/backend/app/analysis/video_stt.py @@ -0,0 +1,401 @@ +""" +视频口播转写(Speech-to-Text):从音轨提取 WAV 后调用 **OpenAI 兼容** 的 `/v1/audio/transcriptions`。 + +说明:用户口语中的「TTS」常混用;此处为 **ASR(语音→文字)**,不是文字转语音。 + +兼容: +- OpenAI 官方:`OPENAI_WHISPER_BASE_URL=https://api.openai.com/v1`,`WHISPER_MODEL=whisper-1` +- 硅基流动 ASR:`OPENAI_WHISPER_BASE_URL=https://api.siliconflow.cn/v1`,`WHISPER_MODEL` 如 `TeleAI/TeleSpeechASR`、`FunAudioLLM/SenseVoiceSmall`(走「创建语音转文本」/audio/transcriptions,非 TTS 的 upload-voice) + +依赖: +- 系统 PATH 中可用 `ffmpeg`(提取音轨) +- 配置 `OPENAI_WHISPER_API_KEY` + `OPENAI_WHISPER_BASE_URL`(勿用 MiMo 对话 Key 冒充第三方 ASR) +""" +from __future__ import annotations + +import asyncio +import io +import logging +import os +import subprocess +import tempfile +from typing import Optional + +logger = logging.getLogger("noterx.video_stt") + + +def _stt_enabled() -> bool: + # 默认开启:只要具备可用 ASR 配置就执行;可通过 VIDEO_STT_ENABLED=0 显式关闭 + v = os.getenv("VIDEO_STT_ENABLED", "1").strip().lower() + return v in ("1", "true", "yes", "on") + + +def _resolve_whisper_client_config() -> tuple[Optional[str], Optional[str]]: + """ + @returns (api_key, base_url) 若不可用则 (None, None) + """ + whisper_key = (os.getenv("OPENAI_WHISPER_API_KEY") or "").strip() + explicit = (os.getenv("OPENAI_WHISPER_BASE_URL") or "").strip().rstrip("/") + + if explicit: + # 任意第三方 / 官方 ASR 基址:必须用专用 Key,禁止回退到 MiMo 的 OPENAI_API_KEY + if not whisper_key: + logger.info( + "VIDEO_STT: 已设置 OPENAI_WHISPER_BASE_URL,请在 .env 填写 OPENAI_WHISPER_API_KEY", + ) + return None, None + return whisper_key, explicit + + key = whisper_key or (os.getenv("OPENAI_API_KEY") or "").strip() + if not key: + return None, None + + main = (os.getenv("OPENAI_BASE_URL") or "").strip().lower() + if "xiaomimimo" in main or "mimo-v2.com" in main: + logger.info( + "VIDEO_STT: 已配置 MiMo 为 OPENAI_BASE_URL;请设置 OPENAI_WHISPER_BASE_URL 与 OPENAI_WHISPER_API_KEY," + "例如官方 https://api.openai.com/v1 或硅基流动 https://api.siliconflow.cn/v1", + ) + return None, None + + base = (os.getenv("OPENAI_BASE_URL") or "https://api.openai.com/v1").strip().rstrip("/") + return key, base + + +def _probe_video_duration_seconds(video_path: str) -> Optional[float]: + """用 ffprobe 获取视频时长(秒)。""" + try: + proc = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + video_path, + ], + capture_output=True, + timeout=20, + check=False, + ) + if proc.returncode != 0: + return None + raw = (proc.stdout or b"").decode("utf-8", errors="replace").strip() + if not raw: + return None + dur = float(raw) + if dur <= 0: + return None + return dur + except Exception: + return None + + +def _extract_wav_segment(video_path: str, *, start_sec: float, clip_sec: float) -> bytes: + """ + 从视频中提取一个音频片段为 16kHz 单声道 WAV。 + @returns 空 bytes 表示失败 + """ + fd, wav_path = tempfile.mkstemp(suffix=".wav") + os.close(fd) + try: + cmd = [ + "ffmpeg", + "-y", + "-ss", + f"{max(start_sec, 0):.3f}", + "-i", + video_path, + "-vn", + "-acodec", + "pcm_s16le", + "-ar", + "16000", + "-ac", + "1", + "-t", + f"{max(clip_sec, 1.0):.3f}", + wav_path, + ] + proc = subprocess.run( + cmd, + capture_output=True, + timeout=max(120, int(clip_sec * 1.5)), + check=False, + ) + if proc.returncode != 0: + err = (proc.stderr or b"").decode("utf-8", errors="replace")[:400] + logger.warning("VIDEO_STT: ffmpeg 片段提取失败 rc=%s %s", proc.returncode, err) + return b"" + with open(wav_path, "rb") as wf: + out = wf.read() + return out + except FileNotFoundError: + logger.warning("VIDEO_STT: 未找到 ffmpeg,请安装后加入 PATH") + return b"" + except subprocess.TimeoutExpired: + logger.warning("VIDEO_STT: ffmpeg 片段提取超时") + return b"" + except Exception as e: + logger.warning("VIDEO_STT: 片段提取异常 %s", e) + return b"" + finally: + if wav_path and os.path.exists(wav_path): + try: + os.remove(wav_path) + except OSError: + pass + + +def _extract_wav_chunks_from_video_bytes(video_bytes: bytes, container_suffix: str) -> list[bytes]: + """ + 用 ffmpeg 将视频音轨切分为多个 16kHz 单声道 WAV 片段(Whisper 友好)。 + 默认按分段转写,避免单文件过大导致接口拒收。 + @returns WAV 片段列表;空列表表示失败(无 ffmpeg / 无音轨) + """ + suffix = container_suffix if container_suffix.startswith(".") else f".{container_suffix}" + max_total_sec = int(os.getenv("VIDEO_STT_MAX_AUDIO_SECONDS", "3600")) + max_total_sec = max(60, min(max_total_sec, 14400)) + seg_sec = int(os.getenv("VIDEO_STT_SEGMENT_SECONDS", "480")) + seg_sec = max(30, min(seg_sec, 1200)) + + video_path = "" + chunks: list[bytes] = [] + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as vf: + vf.write(video_bytes) + video_path = vf.name + + duration = _probe_video_duration_seconds(video_path) + if duration is None: + # 无法探测时按上限兜底,至少尝试一次 + duration = float(max_total_sec) + target_total = min(duration, float(max_total_sec)) + if duration > max_total_sec: + logger.info( + "VIDEO_STT: 视频时长 %.1fs 超过上限 %ss,超出部分不转写", + duration, + max_total_sec, + ) + + seg_count = max(1, int((target_total + seg_sec - 1) // seg_sec)) + for idx in range(seg_count): + start_sec = idx * seg_sec + remain = target_total - start_sec + if remain <= 0: + break + clip_sec = min(seg_sec, remain) + wav = _extract_wav_segment(video_path, start_sec=start_sec, clip_sec=clip_sec) + if not wav: + continue + if len(wav) > 24 * 1024 * 1024: + logger.warning( + "VIDEO_STT: 片段 %s WAV 超过 24MB,建议调小 VIDEO_STT_SEGMENT_SECONDS", + idx + 1, + ) + chunks.append(wav) + return chunks + except Exception as e: + logger.warning("VIDEO_STT: 提取音轨异常 %s", e) + return [] + finally: + if video_path and os.path.exists(video_path): + try: + os.remove(video_path) + except OSError: + pass + + +def _join_transcript_parts(parts: list[str]) -> str: + """拼接多段转写,做轻量去重。""" + out: list[str] = [] + for part in parts: + text = (part or "").strip() + if not text: + continue + if not out: + out.append(text) + continue + prev = out[-1] + if text in prev: + continue + if prev in text: + out[-1] = text + continue + out.append(text) + return "\n".join(out).strip() + + +async def _transcribe_single_wav( + client, + *, + model: str, + wav: bytes, + timeout_sec: float, + language: Optional[str], +) -> str: + """ + 转写单个 wav 片段。优先尝试 verbose_json(可拿更完整分段),不支持则自动回退。 + """ + base_kwargs: dict = { + "model": model, + "timeout": timeout_sec, + } + if language: + base_kwargs["language"] = language + + prefer_verbose = os.getenv("VIDEO_STT_PREFER_VERBOSE_JSON", "1").strip().lower() in ( + "1", + "true", + "yes", + "on", + ) + + if prefer_verbose: + try: + buf = io.BytesIO(wav) + buf.name = "audio.wav" + resp = await client.audio.transcriptions.create( + **base_kwargs, + file=buf, + response_format="verbose_json", + ) + text = _transcription_text_from_response(resp) + if text: + return text + except Exception as e: + logger.info("VIDEO_STT: verbose_json 不可用,回退默认格式: %s", e) + + buf = io.BytesIO(wav) + buf.name = "audio.wav" + resp = await client.audio.transcriptions.create(**base_kwargs, file=buf) + return _transcription_text_from_response(resp) + + +async def transcribe_video_with_whisper(video_bytes: bytes, container_suffix: str) -> str: + """ + 异步:线程池提取 WAV + AsyncOpenAI Whisper 转写。 + @returns 转写文本,失败或未开启时为空字符串 + """ + if not _stt_enabled(): + return "" + + key, base = _resolve_whisper_client_config() + if not key or not base: + return "" + + model = (os.getenv("WHISPER_MODEL") or "whisper-1").strip() + + chunks = await asyncio.to_thread(_extract_wav_chunks_from_video_bytes, video_bytes, container_suffix) + if not chunks: + logger.warning( + "VIDEO_STT: WAV 片段为空,未调用转写 API(检查 ffmpeg、视频是否含音轨、上方 stderr 日志)", + ) + return "" + + import httpx + from openai import AsyncOpenAI + + _stt_http_timeout = float(os.getenv("VIDEO_STT_TIMEOUT_SEC", "240")) + _stt_http_timeout = max(60.0, min(_stt_http_timeout, 600.0)) + http_client = httpx.AsyncClient( + proxy=None, + trust_env=False, + timeout=httpx.Timeout(_stt_http_timeout, connect=60.0), + ) + try: + client = AsyncOpenAI(api_key=key, base_url=base, http_client=http_client) + # 未设置环境变量时默认 zh;显式设为空字符串则不传 language(部分硅基模型可能拒参) + _lr = os.getenv("VIDEO_STT_LANGUAGE") + if _lr is None: + lang = "zh" + else: + lang = _lr.strip() or None + texts: list[str] = [] + total = len(chunks) + for idx, wav in enumerate(chunks, start=1): + try: + text = await _transcribe_single_wav( + client, + model=model, + wav=wav, + timeout_sec=_stt_http_timeout, + language=lang, + ) + except Exception as e: + logger.warning("VIDEO_STT: 片段转写失败 chunk=%s/%s err=%s", idx, total, e) + text = "" + if text: + texts.append(text) + logger.info("VIDEO_STT: 片段转写成功 chunk=%s/%s len=%s", idx, total, len(text)) + else: + logger.warning("VIDEO_STT: 片段转写为空 chunk=%s/%s", idx, total) + + merged = _join_transcript_parts(texts) + if merged: + logger.info("VIDEO_STT: 全片转写成功 chunks=%s total_len=%s model=%s", total, len(merged), model) + else: + logger.warning("VIDEO_STT: 全片转写为空,请核对 ASR 模型与音频内容") + return merged + except Exception as e: + logger.warning("VIDEO_STT: 转写 API 失败 %s", e) + return "" + finally: + await http_client.aclose() + + +def _transcription_text_from_response(resp: object) -> str: + """ + 从 OpenAI SDK / 硅基 JSON 转写响应中取出纯文本。 + @param resp - Transcription 对象或兼容结构 + @returns 去首尾空白的转写文本 + """ + if resp is None: + return "" + segments = getattr(resp, "segments", None) + if isinstance(segments, list): + lines = [ + str((seg or {}).get("text", "")).strip() + for seg in segments + if isinstance(seg, dict) and str((seg or {}).get("text", "")).strip() + ] + if lines: + return "\n".join(lines).strip() + t = getattr(resp, "text", None) + if isinstance(t, str) and t.strip(): + return t.strip() + dump = getattr(resp, "model_dump", None) + if callable(dump): + try: + d = dump() + if isinstance(d, dict): + segs = d.get("segments") + if isinstance(segs, list): + lines = [ + str((seg or {}).get("text", "")).strip() + for seg in segs + if isinstance(seg, dict) and str((seg or {}).get("text", "")).strip() + ] + if lines: + return "\n".join(lines).strip() + tx = d.get("text") + if isinstance(tx, str) and tx.strip(): + return tx.strip() + except Exception: + pass + if isinstance(resp, dict): + segs = resp.get("segments") + if isinstance(segs, list): + lines = [ + str((seg or {}).get("text", "")).strip() + for seg in segs + if isinstance(seg, dict) and str((seg or {}).get("text", "")).strip() + ] + if lines: + return "\n".join(lines).strip() + tx = resp.get("text") + if isinstance(tx, str) and tx.strip(): + return tx.strip() + return "" diff --git a/backend/app/api/screenshot_api.py b/backend/app/api/screenshot_api.py index 87e908f..60d3ba6 100644 --- a/backend/app/api/screenshot_api.py +++ b/backend/app/api/screenshot_api.py @@ -18,6 +18,7 @@ from app.agents.base_agent import _get_client, _is_mimo_openai_compat, _parse_json_from_llm_text from app.analysis.mimo_video import build_mimo_video_url_content_part +from app.analysis.video_stt import transcribe_video_with_whisper from app.api.diagnose import ( MAX_VIDEO_SIZE, MIME_TO_EXT, @@ -412,11 +413,34 @@ def _content_text_looks_like_video_scene_caption(text: str) -> bool: return False +def _strip_video_scene_caption_lines(text: str) -> str: + """ + 清除内容中的「画面描述型」旁白行,仅保留逐字字幕/口播文本。 + """ + s = str(text or "").strip() + if not s: + return "" + + lines = [ln.strip() for ln in s.splitlines() if ln.strip()] + if not lines: + return "" + + kept = [ln for ln in lines if not _content_text_looks_like_video_scene_caption(ln)] + if kept: + return "\n".join(kept).strip() + return "" + + def _sanitize_video_meta_narrative_content(result: dict) -> None: - """若正文被填成画面叙述而非字幕原文,清空 content_text,便于触发抽帧/OCR。""" + """ + 若正文混入画面叙述旁白,移除旁白行,仅保留逐字字幕/口播。 + 旧逻辑会整段清空,可能误伤已并入的 STT 正文。 + """ ct = str(result.get("content_text", "")).strip() - if ct and _content_text_looks_like_video_scene_caption(ct): - result["content_text"] = "" + if not ct: + return + cleaned = _strip_video_scene_caption_lines(ct) + result["content_text"] = cleaned def _normalize_quick_recognition_fields( @@ -466,7 +490,7 @@ def _video_subtitle_payload_insufficient(result: dict) -> bool: 视频快识:无可用字幕正文(含模型把画面说明误填进 content_text)时需抽帧或 OCR。 """ ct = str(result.get("content_text", "")).strip() - if ct and _content_text_looks_like_video_scene_caption(ct): + if ct and not _strip_video_scene_caption_lines(ct): return True return _quick_payload_is_empty(result) @@ -530,6 +554,23 @@ def _merge_subtitle_transcript_into_result(result: dict, lines: list[str]) -> No ) +def _merge_stt_into_video_result(result: dict, stt: str) -> None: + """将 Whisper 口播转写并入 content_text(与画面字幕互补)。""" + text = (stt or "").strip() + if not text: + return + prev_raw = str(result.get("content_text", "")).strip() + prev = _strip_video_scene_caption_lines(prev_raw) + if not prev: + result["content_text"] = text + return + if text in prev or prev in text: + if len(text) > len(prev): + result["content_text"] = text + return + result["content_text"] = f"{prev}\n\n{text}".strip() + + async def _video_url_quick_call(client, video_url: str) -> dict: """ 通过 MiMo 视频理解(video_url content part)请求模型,返回与快识相同结构的 JSON。 @@ -628,11 +669,40 @@ async def _video_url_subtitle_transcript_call(client, video_url: str) -> list[st return _parse_subtitle_lines_payload(parsed) +def _video_title_body_same_short_hook(result: dict) -> bool: + """ + 标题与正文是否为同一句短花字(常见于视频首帧钩子),用于触发首帧 OCR 补全。 + """ + tt = str(result.get("title", "")).strip() + ct = str(result.get("content_text", "")).strip() + return bool(tt and ct and tt == ct and len(ct) <= 40) + + +def _ocr_supplement_already_sufficient(title_text: str, content_text: str) -> bool: + """ + 判断快识是否已足够完整,可跳过首帧 OCR 补全。 + + 视频场景里模型常把同一句花字同时填进标题与正文(如「注意看」),若二者非空即跳过 OCR, + 则永远无法用首帧 OCR 拉长正文,口播 ASR 又失败时界面会一直只有三个字。 + """ + ct = (content_text or "").strip() + tt = (title_text or "").strip() + if not ct or not tt: + return False + if tt == ct and len(ct) <= 40: + return False + if len(ct) >= 52: + return True + if tt != ct and len(ct) >= 32: + return True + return False + + async def _ocr_supplement_quick_result(client, image_bytes: bytes, result: dict, ocr_cap: int) -> None: - """title/content 缺省时用 OCR 补全(与图片快识一致)。""" + """title/content 缺省或过短时用 OCR 补全(与图片快识一致)。""" content_text = str(result.get("content_text", "")).strip() title_text = str(result.get("title", "")).strip() - if content_text and title_text: + if _ocr_supplement_already_sufficient(title_text, content_text): return try: from app.analysis.ocr_processor import OCRProcessor @@ -777,6 +847,8 @@ async def quick_recognize_video(request: Request, file: UploadFile = File(...)): quick_max_out = _quick_image_max_out_tokens() ocr_cap = _quick_ocr_max_tokens() + stt_task = asyncio.create_task(transcribe_video_with_whisper(video_bytes, container_ext)) + result: dict = {} video_url_mimo: Optional[str] = None url_diag = get_public_base_url_diagnostics(request) @@ -851,11 +923,51 @@ async def quick_recognize_video(request: Request, file: UploadFile = File(...)): not str(result.get("title", "")).strip() or not str(result.get("content_text", "")).strip() or _content_text_looks_like_video_scene_caption(str(result.get("content_text", "")).strip()) + or _video_title_body_same_short_hook(result) ): frame_jpeg = _extract_first_video_frame(video_bytes, container_ext) if frame_jpeg: await _ocr_supplement_quick_result(client, frame_jpeg, result, ocr_cap) + stt_text = "" + try: + _stt_t = float(os.getenv("VIDEO_STT_TIMEOUT_SEC", "240")) + except ValueError: + _stt_t = 240.0 + stt_timeout = max(30.0, min(_stt_t, 600.0)) + try: + stt_text = await asyncio.wait_for(stt_task, timeout=stt_timeout) + except asyncio.TimeoutError: + logger.warning("VIDEO_STT: Whisper 等待超时(%.0fs)", stt_timeout) + stt_task.cancel() + try: + await stt_task + except asyncio.CancelledError: + pass + except Exception as e: + logger.warning("VIDEO_STT: 合并前异常 %s", e) + + _prev_ct_len = len(str(result.get("content_text", "") or "")) + _merge_stt_into_video_result(result, stt_text) + _after_ct_len = len(str(result.get("content_text", "") or "")) + logger.info( + "VIDEO_STT: 口播合并 prev_content_len=%s stt_len=%s merged_content_len=%s", + _prev_ct_len, + len((stt_text or "").strip()), + _after_ct_len, + ) + _stt_env_on = os.getenv("VIDEO_STT_ENABLED", "0").strip().lower() in ( + "1", + "true", + "yes", + "on", + ) + if not (stt_text or "").strip() and _stt_env_on: + logger.warning( + "VIDEO_STT: 口播转写为空,正文仍主要来自视频模型/OCR;" + "请看上方 VIDEO_STT 日志(ffmpeg、API、代理已改为 trust_env=False 直连)", + ) + _sanitize_video_meta_narrative_content(result) if _quick_payload_is_empty(result): diff --git a/backend/tests/test_video_text_merge.py b/backend/tests/test_video_text_merge.py new file mode 100644 index 0000000..c799c2a --- /dev/null +++ b/backend/tests/test_video_text_merge.py @@ -0,0 +1,32 @@ +""" +视频快识正文清洗/合并逻辑测试。 +""" +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from app.api.screenshot_api import ( + _strip_video_scene_caption_lines, + _merge_stt_into_video_result, + _video_subtitle_payload_insufficient, +) + + +def test_strip_scene_caption_lines_keeps_real_transcript(): + text = "视频帧显示一位女生在厨房做饭\n这就是我家餐桌上出现率最高的一道菜" + cleaned = _strip_video_scene_caption_lines(text) + assert "视频帧显示" not in cleaned + assert "这就是我家餐桌上出现率最高的一道菜" in cleaned + + +def test_merge_stt_replaces_scene_caption_only_payload(): + result = {"content_text": "视频展示了一位博主并叠加字幕提示不要焯水"} + _merge_stt_into_video_result(result, "切记不要焯水\n这样更脆更香") + assert "视频展示了" not in result["content_text"] + assert "切记不要焯水" in result["content_text"] + + +def test_video_payload_insufficient_when_only_scene_caption(): + result = {"content_text": "视频帧显示一位女士在厨房烹饪蘑菇,并叠加字幕提示不要焯水"} + assert _video_subtitle_payload_insufficient(result) is True diff --git a/frontend/src/utils/api.ts b/frontend/src/utils/api.ts index 44e43c0..d69142b 100644 --- a/frontend/src/utils/api.ts +++ b/frontend/src/utils/api.ts @@ -402,7 +402,11 @@ export async function quickRecognizeVideo(file: File): Promise( "/screenshot/quick-recognize-video", fd, - { headers: { "Content-Type": "multipart/form-data" }, timeout: 180000 } + { + headers: { "Content-Type": "multipart/form-data" }, + /** 视频快识包含整段 STT,长视频可能需要数分钟 */ + timeout: 600_000, + } ); return data; }