From 43f6eb093e7daf9a7c539005516315559be9c49a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?= Date: Mon, 30 Mar 2026 22:21:00 +0200 Subject: [PATCH 1/4] Warning for importing corrupted vtt/srt file When time stamps are not sequential and currupted, user will be notified with yellow box stating the problem with the file during the import process. The user may still proceed. --- .../suite/vttRoundtrip.integration.test.ts | 2 +- .../subtitles/SubtitlesImporterForm.tsx | 18 ++- .../importers/subtitles/index.test.ts | 133 +++++++++++++++++- .../importers/subtitles/index.ts | 70 +++++++++ 4 files changed, 220 insertions(+), 3 deletions(-) diff --git a/src/test/suite/vttRoundtrip.integration.test.ts b/src/test/suite/vttRoundtrip.integration.test.ts index 6208c113a..95056a349 100644 --- a/src/test/suite/vttRoundtrip.integration.test.ts +++ b/src/test/suite/vttRoundtrip.integration.test.ts @@ -105,7 +105,7 @@ suite("VTT round-trip integration (mock VTT fixtures)", function () { }); assert.ok(alignedTextCell, "Expected at least one aligned text cell"); - const alignedStart = alignedTextCell!.notebookCell.metadata.data.startTime as number; + const alignedStart = alignedTextCell!.notebookCell!.metadata!.data!.startTime as number; const expectedContent = importedByStart.get(alignedStart); assert.ok(expectedContent, "Expected imported content for aligned text cell"); assert.strictEqual( diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/SubtitlesImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/SubtitlesImporterForm.tsx index 083e846cc..70ceee2ee 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/SubtitlesImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/SubtitlesImporterForm.tsx @@ -24,7 +24,7 @@ import { AlertCircle, } from "lucide-react"; import { Badge } from "../../../components/ui/badge"; -import { subtitlesImporter } from "./index"; +import { subtitlesImporter, validateSubtitleTimestamps } from "./index"; import { subtitlesImporterPlugin } from "./index.tsx"; import { handleImportCompletion, notebookToImportedContent } from "../common/translationHelper"; import { notifyImportStarted, notifyImportEnded } from "../../utils/importProgress"; @@ -45,6 +45,7 @@ export const SubtitlesImporterForm: React.FC = (props) = const [result, setResult] = useState(null); const [alignedCells, setAlignedCells] = useState(null); const [previewContent, setPreviewContent] = useState(""); + const [fileWarnings, setFileWarnings] = useState([]); const [subtitleStats, setSubtitleStats] = useState<{ totalCues: number; duration: string; @@ -64,6 +65,7 @@ export const SubtitlesImporterForm: React.FC = (props) = setResult(null); setAlignedCells(null); setSubtitleStats(null); + setFileWarnings([]); // Show preview and analyze file try { @@ -103,6 +105,9 @@ export const SubtitlesImporterForm: React.FC = (props) = duration, format, }); + + const timestampWarnings = validateSubtitleTimestamps(text); + setFileWarnings(timestampWarnings); } catch (err) { console.warn("Could not preview file:", err); } @@ -581,6 +586,17 @@ export const SubtitlesImporterForm: React.FC = (props) = )} + {fileWarnings.length > 0 && ( + + + + {fileWarnings.map((warning, index) => ( +

{warning}

+ ))} +
+
+ )} + {previewContent && ( diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts index f14be1fbc..dd23b18d2 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { subtitlesImporter } from './index'; +import { subtitlesImporter, validateSubtitleTimestamps } from './index'; // Minimal File-like shim for tests class MockFile { @@ -41,4 +41,135 @@ describe('subtitlesImporter.parseFile', () => { }); }); +describe('validateSubtitleTimestamps', () => { + it('returns no warnings for a well-ordered VTT', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '00:00:01.000 --> 00:00:03.000', + 'First cue', + '', + '2', + '00:00:04.000 --> 00:00:06.000', + 'Second cue', + '', + '3', + '00:00:07.000 --> 00:00:09.000', + 'Third cue', + ].join('\n'); + expect(validateSubtitleTimestamps(vtt)).toEqual([]); + }); + + it('returns no warnings for a well-ordered SRT', () => { + const srt = [ + '1', + '00:00:01,000 --> 00:00:03,000', + 'First cue', + '', + '2', + '00:00:04,000 --> 00:00:06,000', + 'Second cue', + ].join('\n'); + + expect(validateSubtitleTimestamps(srt)).toEqual([]); + }); + + it('ignores small overlaps from multi-speaker cues (<5s)', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '00:00:10.000 --> 00:00:14.000', + 'Speaker A', + '', + '1', + '00:00:10.000 --> 00:00:14.000', + 'Speaker B (same timestamp, overlap of 4s)', + ].join('\n'); + + expect(validateSubtitleTimestamps(vtt)).toEqual([]); + }); + + it('warns when timestamps jump backwards significantly (corrupted hour)', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '01:00:23.625 --> 01:00:28.458', + 'Cue with wrong hour', + '', + '2', + '00:00:52.886 --> 00:00:54.763', + 'Cue with correct hour (jumps back ~3573s)', + ].join('\n'); + + const warnings = validateSubtitleTimestamps(vtt); + expect(warnings.length).toBe(1); + expect(warnings[0]).toMatch(/non-sequential timestamps/); + expect(warnings[0]).toMatch(/1 hour/); + }); + + it('counts multiple out-of-order cues', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '01:00:23.000 --> 01:00:28.000', + 'Wrong hour', + '', + '2', + '00:00:30.000 --> 00:00:35.000', + 'Correct (jump back)', + '', + '3', + '01:00:40.000 --> 01:00:45.000', + 'Wrong hour again', + '', + '4', + '00:00:50.000 --> 00:00:55.000', + 'Correct (jump back again)', + ].join('\n'); + + const warnings = validateSubtitleTimestamps(vtt); + expect(warnings.length).toBe(1); + expect(warnings[0]).toMatch(/2 subtitle cue/); + }); + + it('reports minutes for moderate jumps', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '00:05:00.000 --> 00:05:30.000', + 'Later cue first', + '', + '2', + '00:01:00.000 --> 00:01:05.000', + 'Earlier cue second (jumps back ~270s)', + ].join('\n'); + + const warnings = validateSubtitleTimestamps(vtt); + expect(warnings.length).toBe(1); + expect(warnings[0]).toMatch(/\d+ minutes/); + }); + + it('reports seconds for small jumps', () => { + const vtt = [ + 'WEBVTT', + '', + '1', + '00:00:30.000 --> 00:00:40.000', + 'Later cue first', + '', + '2', + '00:00:10.000 --> 00:00:15.000', + 'Earlier cue second (jumps back ~30s)', + ].join('\n'); + + const warnings = validateSubtitleTimestamps(vtt); + expect(warnings.length).toBe(1); + expect(warnings[0]).toMatch(/\d+ seconds/); + }); +}); diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts index de280e1fe..d865fe974 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts @@ -20,6 +20,69 @@ import { createSubtitleCellMetadata } from './cellMetadata'; const SUPPORTED_EXTENSIONS = ['vtt', 'srt', 'ass', 'sub']; +/** + * Threshold in seconds for flagging a timestamp as significantly out-of-order. + * Small overlaps (<5s) are common in multi-speaker subtitle files and are ignored. + */ +const OUT_OF_ORDER_THRESHOLD_SECONDS = 5; + +/** + * Scans raw subtitle text for non-sequential timestamps. + * Returns an array of warning strings (empty if no issues found). + * Works with both VTT (`.` separator) and SRT (`,` separator) formats. + */ +export const validateSubtitleTimestamps = (content: string): string[] => { + const warnings: string[] = []; + const timestampRegex = + /(\d{2}):(\d{2}):(\d{2})[.,](\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2})[.,](\d{3})/g; + + let match: RegExpExecArray | null; + let prevEndTime = -1; + let cueIndex = 0; + const majorJumps: { cueIndex: number; jumpBackSeconds: number }[] = []; + + while ((match = timestampRegex.exec(content)) !== null) { + cueIndex++; + const startTime = + parseInt(match[1]) * 3600 + + parseInt(match[2]) * 60 + + parseInt(match[3]) + + parseInt(match[4]) / 1000; + const endTime = + parseInt(match[5]) * 3600 + + parseInt(match[6]) * 60 + + parseInt(match[7]) + + parseInt(match[8]) / 1000; + + if (prevEndTime >= 0 && startTime < prevEndTime) { + const jumpBack = prevEndTime - startTime; + if (jumpBack > OUT_OF_ORDER_THRESHOLD_SECONDS) { + majorJumps.push({ cueIndex, jumpBackSeconds: jumpBack }); + } + } + prevEndTime = endTime; + } + + if (majorJumps.length > 0) { + const maxJump = Math.max(...majorJumps.map((j) => j.jumpBackSeconds)); + const formattedMaxJump = + maxJump >= 3000 + ? `${Math.round(maxJump / 3600)} hour(s)` + : maxJump >= 120 + ? `${Math.round(maxJump / 60)} minutes` + : `${Math.round(maxJump)} seconds`; + + warnings.push( + `Found ${majorJumps.length} subtitle cue(s) with non-sequential timestamps ` + + `(jumping backwards by up to ${formattedMaxJump}). ` + + `This typically indicates corrupted timing data (e.g., incorrect hour values). ` + + `The imported content may not be in the correct order.` + ); + } + + return warnings; +}; + /** * Parses SRT content and converts it to VTT-like structure */ @@ -117,6 +180,10 @@ const validateFile = async (file: File): Promise => { warnings.push('No timestamp patterns found - this may not be a subtitle file'); } + // Check for non-sequential timestamps + const timestampWarnings = validateSubtitleTimestamps(content); + warnings.push(...timestampWarnings); + } catch (error) { errors.push('Could not read file content'); } @@ -187,6 +254,8 @@ export const parseFile = async ( throw new Error('No subtitle cues found in the file'); } + const timestampWarnings = validateSubtitleTimestamps(content); + onProgress?.(createProgress('Creating Cells', 'Creating notebook cells...', 70)); // Create notebook cells using ProcessedCell format with cellMetadata @@ -273,6 +342,7 @@ export const parseFile = async ( return { success: true, notebookPair: notebookPairWithMilestones, + warnings: timestampWarnings.length > 0 ? timestampWarnings : undefined, metadata: { segmentCount: sourceNotebook.cells.length, format, From 19faaae12c5cc6bed6d4e10282f9d22a4b8b7280 Mon Sep 17 00:00:00 2001 From: Ben Scholtens Date: Tue, 31 Mar 2026 11:33:20 -0400 Subject: [PATCH 2/4] Refactor subtitle timestamp validation logic - Removed the OUT_OF_ORDER_THRESHOLD_SECONDS constant and adjusted the logic to flag any backward jumps in timestamps. - Improved formatting of warning messages for better readability. - Ensured that the validation function continues to provide accurate warnings for non-sequential timestamps in subtitle files. --- .../importers/subtitles/index.ts | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts index d865fe974..be8bbff92 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts @@ -20,12 +20,6 @@ import { createSubtitleCellMetadata } from './cellMetadata'; const SUPPORTED_EXTENSIONS = ['vtt', 'srt', 'ass', 'sub']; -/** - * Threshold in seconds for flagging a timestamp as significantly out-of-order. - * Small overlaps (<5s) are common in multi-speaker subtitle files and are ignored. - */ -const OUT_OF_ORDER_THRESHOLD_SECONDS = 5; - /** * Scans raw subtitle text for non-sequential timestamps. * Returns an array of warning strings (empty if no issues found). @@ -39,7 +33,7 @@ export const validateSubtitleTimestamps = (content: string): string[] => { let match: RegExpExecArray | null; let prevEndTime = -1; let cueIndex = 0; - const majorJumps: { cueIndex: number; jumpBackSeconds: number }[] = []; + const majorJumps: { cueIndex: number; jumpBackSeconds: number; }[] = []; while ((match = timestampRegex.exec(content)) !== null) { cueIndex++; @@ -56,7 +50,7 @@ export const validateSubtitleTimestamps = (content: string): string[] => { if (prevEndTime >= 0 && startTime < prevEndTime) { const jumpBack = prevEndTime - startTime; - if (jumpBack > OUT_OF_ORDER_THRESHOLD_SECONDS) { + if (jumpBack > 0) { majorJumps.push({ cueIndex, jumpBackSeconds: jumpBack }); } } @@ -69,14 +63,14 @@ export const validateSubtitleTimestamps = (content: string): string[] => { maxJump >= 3000 ? `${Math.round(maxJump / 3600)} hour(s)` : maxJump >= 120 - ? `${Math.round(maxJump / 60)} minutes` - : `${Math.round(maxJump)} seconds`; + ? `${Math.round(maxJump / 60)} minutes` + : `${Math.round(maxJump)} seconds`; warnings.push( `Found ${majorJumps.length} subtitle cue(s) with non-sequential timestamps ` + - `(jumping backwards by up to ${formattedMaxJump}). ` + - `This typically indicates corrupted timing data (e.g., incorrect hour values). ` + - `The imported content may not be in the correct order.` + `(jumping backwards by up to ${formattedMaxJump}). ` + + `This typically indicates corrupted timing data (e.g., incorrect hour values). ` + + `The imported content may not be in the correct order.` ); } From 2f0b677677f8a9e6bf579c71e6ab3638836937c2 Mon Sep 17 00:00:00 2001 From: Ben Scholtens Date: Tue, 31 Mar 2026 11:37:17 -0400 Subject: [PATCH 3/4] Update test description for subtitle timestamp validation - Changed the test case description to clarify that it now catches small overlaps from multi-speaker cues instead of ignoring them. --- .../src/NewSourceUploader/importers/subtitles/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts index dd23b18d2..3ceffbcf4 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts @@ -76,7 +76,7 @@ describe('validateSubtitleTimestamps', () => { expect(validateSubtitleTimestamps(srt)).toEqual([]); }); - it('ignores small overlaps from multi-speaker cues (<5s)', () => { + it('catches small overlaps from multi-speaker cues (<5s)', () => { const vtt = [ 'WEBVTT', '', From 411f597e5f4bf278a8962f898ca0addf972e9770 Mon Sep 17 00:00:00 2001 From: Ben Scholtens Date: Tue, 31 Mar 2026 11:42:13 -0400 Subject: [PATCH 4/4] Enhance subtitle timestamp validation test - Updated the test to check for specific warnings related to non-sequential timestamps, ensuring it captures cases with significant overlaps in speaker cues. --- .../src/NewSourceUploader/importers/subtitles/index.test.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts index 3ceffbcf4..5db3762e9 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.test.ts @@ -89,7 +89,10 @@ describe('validateSubtitleTimestamps', () => { 'Speaker B (same timestamp, overlap of 4s)', ].join('\n'); - expect(validateSubtitleTimestamps(vtt)).toEqual([]); + const warnings = validateSubtitleTimestamps(vtt); + expect(warnings.length).toBe(1); + expect(warnings[0]).toMatch(/non-sequential timestamps/); + expect(warnings[0]).toMatch(/4 seconds/); }); it('warns when timestamps jump backwards significantly (corrupted hour)', () => {