From 48d3d3778ec651620b02fe77491ce6160205cb5b Mon Sep 17 00:00:00 2001 From: dadukhankevin Date: Wed, 11 Mar 2026 23:54:46 -0500 Subject: [PATCH 1/5] Surface raw HTML content through translation pair pipeline for allowHtmlPredictions The allowHtmlPredictions toggle was incomplete: HTML was stripped at SQLite index time but the raw content (already stored in s_raw_content/t_raw_content columns) was never surfaced to the prompt builder. Now rawContent flows through MinimalCellResult so buildFewShotExamplesText can use HTML-preserving examples when the toggle is on. Co-Authored-By: Claude Opus 4.6 --- .../contextAware/contentIndexes/indexes/search.ts | 8 ++++---- .../contextAware/contentIndexes/indexes/sqliteIndex.ts | 8 ++++++++ src/providers/translationSuggestions/shared.ts | 4 ++-- types/index.d.ts | 1 + 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts b/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts index ffdf88433..4b76d8792 100644 --- a/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts +++ b/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts @@ -265,8 +265,8 @@ export async function getTranslationPairsFromSourceCellQuery( debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding direct result for ${cellId}`); translationPairs.push({ cellId, - sourceCell: { cellId, content: searchResult.sourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 }, - targetCell: { cellId, content: searchResult.targetContent, uri: searchResult.uri || "", line: searchResult.line || 0 }, + sourceCell: { cellId, content: searchResult.sourceContent, rawContent: searchResult.rawSourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 }, + targetCell: { cellId, content: searchResult.targetContent, rawContent: searchResult.rawTargetContent, uri: searchResult.uri || "", line: searchResult.line || 0 }, }); } else { debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - empty content after trim`); @@ -278,8 +278,8 @@ export async function getTranslationPairsFromSourceCellQuery( debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding fetched result for ${cellId}`); translationPairs.push({ cellId, - sourceCell: { cellId, content: translationPair.sourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 }, - targetCell: { cellId, content: translationPair.targetContent, uri: translationPair.uri || "", line: translationPair.line || 0 }, + sourceCell: { cellId, content: translationPair.sourceContent, rawContent: translationPair.rawSourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 }, + targetCell: { cellId, content: translationPair.targetContent, rawContent: translationPair.rawTargetContent, uri: translationPair.uri || "", line: translationPair.line || 0 }, }); } else { debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - no valid translation pair found`); diff --git a/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts b/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts index 400b7bee5..aed97c8f2 100644 --- a/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts +++ b/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts @@ -3038,6 +3038,8 @@ export class SQLiteIndexManager { cellLabel: row.cell_label, // NO FALLBACK sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content, + rawSourceContent: row.raw_source_content || row.source_content, + rawTargetContent: row.raw_target_content || row.target_content, content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, uri: row.uri, line: row.line, @@ -3186,6 +3188,8 @@ export class SQLiteIndexManager { cellLabel: row.cell_label, // NO FALLBACK sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent, + rawSourceContent: row.raw_source_content || row.source_content, + rawTargetContent: rawTargetContent || targetContent, content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, uri: row.uri, line: row.line, @@ -3272,6 +3276,8 @@ export class SQLiteIndexManager { cellLabel: row.cell_label || null, sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content, + rawSourceContent: row.raw_source_content || row.source_content, + rawTargetContent: row.raw_target_content || row.target_content, content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, uri: row.uri, line: row.line, @@ -3442,6 +3448,8 @@ export class SQLiteIndexManager { cellLabel: row.cell_label, // NO FALLBACK - show raw value sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent, + rawSourceContent: row.raw_source_content || row.source_content, + rawTargetContent: rawTargetContent || targetContent, content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content, uri: row.uri, line: row.line, diff --git a/src/providers/translationSuggestions/shared.ts b/src/providers/translationSuggestions/shared.ts index f43276e80..c2a121772 100644 --- a/src/providers/translationSuggestions/shared.ts +++ b/src/providers/translationSuggestions/shared.ts @@ -184,8 +184,8 @@ export function buildFewShotExamplesText( const examplesInner = pairs .map((pair) => { - const sourceRaw = pair.sourceCell?.content ?? ""; - const targetRaw = pair.targetCell?.content ?? ""; + const sourceRaw = allowHtml ? (pair.sourceCell?.rawContent || pair.sourceCell?.content || "") : (pair.sourceCell?.content ?? ""); + const targetRaw = allowHtml ? (pair.targetCell?.rawContent || pair.targetCell?.content || "") : (pair.targetCell?.content ?? ""); const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim(); const source = allowHtml ? sourceRaw.trim() : stripHtmlTags(sourceRaw).trim(); const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target); diff --git a/types/index.d.ts b/types/index.d.ts index 3b39faefc..d024c001f 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -517,6 +517,7 @@ type MiniSearchVerseResult = { type MinimalCellResult = { cellId?: string; content?: string; + rawContent?: string; uri?: string; line?: number; notebookId?: string; From e7f8e5ff6efd8ea7d54fd0dd4f8c556cde943614 Mon Sep 17 00:00:00 2001 From: dadukhankevin Date: Thu, 12 Mar 2026 17:38:13 -0500 Subject: [PATCH 2/5] Improve LLM instruction following and simplify system prompt - Restructure system message: consolidate 12 appended instructions into focused paragraphs, reduce noise and redundancy - Simplify HTML preservation: always instruct model to preserve HTML from source, toggle only controls whether HTML is present in examples/context - Fix temperature passthrough: always send configured temperature instead of silently dropping it for the default model - Remove redundant "Instructions" header from user message - Add diagnostic logging for system message and few-shot example HTML Co-Authored-By: Claude Opus 4.6 --- .../translationSuggestions/llmCompletion.ts | 1 + .../translationSuggestions/shared.ts | 63 ++++++++++--------- src/utils/llmUtils.ts | 3 +- src/utils/metadataManager.ts | 2 + 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts index 784dd9c67..4423925ac 100644 --- a/src/providers/translationSuggestions/llmCompletion.ts +++ b/src/providers/translationSuggestions/llmCompletion.ts @@ -216,6 +216,7 @@ export async function llmCompletion( // Build messages — buildMessages is the single source of truth for // system message construction. Pass the raw chatSystemMessage and let // buildMessages append instructions exactly once. + console.log(`[llmCompletion] System message from config (first 200 chars): "${chatSystemMessage?.substring(0, 200)}..."`); const messages = buildMessages( targetLanguage, chatSystemMessage, diff --git a/src/providers/translationSuggestions/shared.ts b/src/providers/translationSuggestions/shared.ts index c2a121772..c4d7c3533 100644 --- a/src/providers/translationSuggestions/shared.ts +++ b/src/providers/translationSuggestions/shared.ts @@ -180,14 +180,19 @@ export function buildFewShotExamplesText( allowHtml: boolean = false, exampleFormat: string = "source-and-target" ): string { - console.debug(`[buildFewShotExamplesText] Building ${pairs.length} examples in '${exampleFormat}' format`); - + console.debug(`[buildFewShotExamplesText] Building ${pairs.length} examples in '${exampleFormat}' format, allowHtml=${allowHtml}`); + const examplesInner = pairs - .map((pair) => { + .map((pair, idx) => { const sourceRaw = allowHtml ? (pair.sourceCell?.rawContent || pair.sourceCell?.content || "") : (pair.sourceCell?.content ?? ""); const targetRaw = allowHtml ? (pair.targetCell?.rawContent || pair.targetCell?.content || "") : (pair.targetCell?.content ?? ""); const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim(); const source = allowHtml ? sourceRaw.trim() : stripHtmlTags(sourceRaw).trim(); + if (allowHtml && idx < 3) { + const hasHtmlInTarget = /<[a-z][^>]*>/i.test(target); + const hasHtmlInSource = /<[a-z][^>]*>/i.test(source); + console.log(`[buildFewShotExamplesText] Example ${idx}: hasHtmlInSource=${hasHtmlInSource}, hasHtmlInTarget=${hasHtmlInTarget}, targetRawContent=${pair.targetCell?.rawContent ? 'present' : 'MISSING'}, target preview="${target.substring(0, 100)}"`); + } const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target); const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source); @@ -218,35 +223,37 @@ export function buildMessages( exampleFormat: string = "source-and-target", sourceLanguage: string | null = null ): ChatMessage[] { - let systemMessage = chatSystemMessage || `You are a helpful assistant`; + const sourceLangText = sourceLanguage ? `${sourceLanguage}` : "the source language"; + const targetLangText = targetLanguage || "the target language"; - if (exampleFormat === "target-only") { - systemMessage += `\n\nReference translations are provided in XML tags. Use these as examples of the translation style and patterns you should follow.`; - } else { - systemMessage += `\n\nInput sections for examples and context are provided in XML. Only use values within and tags.`; + // Build a focused system message: critical output format first, then translation guidance + const parts: string[] = []; + + // User's custom instructions (from metadata.json) come first + if (chatSystemMessage) { + parts.push(chatSystemMessage); } - // Preserve line breaks and specify output format - if (allowHtml) { - systemMessage += `\n\nYou may include inline HTML tags when appropriate (e.g., , , ) consistent with examples. Preserve original line breaks from by returning text with the same number of lines separated by newline characters.`; + + // Translation direction and approach + parts.push(`Translate from ${sourceLangText} to ${targetLangText}. This may be an ultra-low resource language — follow the patterns, style, and vocabulary of the provided reference data closely. When in doubt, err on the side of literalness.`); + + // HTML preservation — always instruct to preserve HTML based on source + parts.push(`If the source text contains HTML formatting (e.g., , , tags), preserve that HTML structure in your translation. Match the formatting of the source.`); + + // Line preservation + parts.push(`Preserve original line breaks from by returning text with the same number of lines.`); + + // Output format + parts.push(`Wrap your final translation in ... tags. Provide only the translation — no commentary, explanations, or metadata.`); + + // Data format hint + if (exampleFormat === "target-only") { + parts.push(`Reference translations are provided in XML tags. Use these as examples of the translation style and patterns to follow.`); } else { - systemMessage += `\n\nReturn plain text only (no XML/HTML). Preserve original line breaks from by returning text with the same number of lines separated by newline characters.`; - } - const sourceLangText = sourceLanguage ? `from ${sourceLanguage} ` : "from the source language "; - systemMessage += `\n\nAlways translate ${sourceLangText}to the target language ${targetLanguage || "" - }, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`; - - systemMessage += `\n\n1. Analyze the provided reference data to understand the translation patterns and style.`; - systemMessage += `\n2. Complete the partial or complete translation of the line.`; - systemMessage += `\n3. Ensure your translation fits seamlessly with the existing partial translation.`; - systemMessage += `\n4. Provide only the completed translation without any additional commentary or metadata.`; - systemMessage += `\n5. Translate only into the target language ${targetLanguage || ""}.`; - systemMessage += `\n6. Pay careful attention to the provided reference data.`; - systemMessage += `\n7. If in doubt, err on the side of literalness.`; - if (allowHtml) { - systemMessage += `\n8. If the project has any styles, return HTML with the appropriate tags or classes as per the examples in the translation memory.`; + parts.push(`Examples and context are provided in XML with and tags.`); } - systemMessage += `\n\nWrap your final translation in ... XML tags. Do not include any other XML tags in your response outside of these tags.`; + const systemMessage = parts.join("\n\n"); const contextXml = `\n${precedingContextPairs.filter(Boolean).join("\n")}\n`; const currentTaskXml = allowHtml @@ -254,8 +261,6 @@ export function buildMessages( : `${xmlEscape(currentCellSourceContent)}`; const userMessage = [ - "## Instructions", - "Follow the translation patterns and style as shown.", "## Translation Memory (XML)", fewShotExamples, "## Current Context (XML)", diff --git a/src/utils/llmUtils.ts b/src/utils/llmUtils.ts index 85fddca98..b0553a15f 100644 --- a/src/utils/llmUtils.ts +++ b/src/utils/llmUtils.ts @@ -95,8 +95,7 @@ export async function callLLM( const completion = await openai.chat.completions.create({ model, messages: messages as ChatCompletionMessageParam[], - // Let the server decide temperature for the default model. - ...(model.toLowerCase() === "default" ? {} : (model.toLowerCase() === "gpt-5" ? { temperature: 1 } : { temperature: config.temperature })), + temperature: config.temperature, }, { signal: abortController.signal }); diff --git a/src/utils/metadataManager.ts b/src/utils/metadataManager.ts index bdcc0930d..219d6fff6 100644 --- a/src/utils/metadataManager.ts +++ b/src/utils/metadataManager.ts @@ -402,8 +402,10 @@ export class MetadataManager { if (result.success && result.metadata) { const chatSystemMessage = (result.metadata as any).chatSystemMessage as string | undefined; if (chatSystemMessage) { + console.log(`[MetadataManager.getChatSystemMessage] Returning stored message (first 100 chars): "${chatSystemMessage.substring(0, 100)}..."`); return chatSystemMessage; } + console.log(`[MetadataManager.getChatSystemMessage] No chatSystemMessage found in metadata.json, will try to generate`); } // Try to generate chatSystemMessage if it doesn't exist From f287432ae284d880f215619a3c3a1bc70fb4abf2 Mon Sep 17 00:00:00 2001 From: dadukhankevin Date: Fri, 13 Mar 2026 11:58:37 -0500 Subject: [PATCH 3/5] Preserve source HTML in LLM prompts Keep raw source HTML in the current-task prompt when HTML predictions are enabled while still using sanitized text for example search. Add regression coverage to ensure the prompt preserves source markup and spacing boundaries. Made-with: Cursor --- .../translationSuggestions/llmCompletion.ts | 35 ++++++++++++++----- .../suite/codexCellEditorProvider.test.ts | 26 +++++++++++--- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts index 4423925ac..2032337ac 100644 --- a/src/providers/translationSuggestions/llmCompletion.ts +++ b/src/providers/translationSuggestions/llmCompletion.ts @@ -136,14 +136,18 @@ export async function llmCompletion( throw new Error(`No source content found for cell ${currentCellId}. The search index may be incomplete. Try running "Force Complete Rebuild" from the command palette.`); } - // Sanitize HTML content to extract plain text (handles transcription spans, etc.) + // Convert source HTML into search-friendly plain text while preserving word + // boundaries that would otherwise be lost when tags are stripped. const sanitizeHtmlContent = (html: string): string => { if (!html) return ''; return html .replace(/]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '') .replace(/]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '') .replace(/]*>[\s\S]*?<\/sup>/gi, '') + .replace(//gi, ' ') .replace(/<\/p>/gi, ' ') + .replace(/<\/div>/gi, ' ') + .replace(/<\/li>/gi, ' ') .replace(/<[^>]*>/g, '') .replace(/ /g, ' ') .replace(/&/g, '&') @@ -157,16 +161,32 @@ export async function llmCompletion( .trim(); }; - const sourceContent = validSourceCells - .map((cell) => sanitizeHtmlContent(cell!.content || "")) + const preserveHtmlInPrompt = Boolean(completionConfig.allowHtmlPredictions); + const searchSourceContent = validSourceCells + .map((cell) => sanitizeHtmlContent(cell?.rawContent || cell?.content || "")) .join(" "); + const currentCellSourceContent = validSourceCells + .map((cell) => { + const rawSourceContent = cell?.rawContent || cell?.content || ""; + if (!preserveHtmlInPrompt) { + return sanitizeHtmlContent(rawSourceContent); + } + + return rawSourceContent + .replace(/]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, "") + .replace(/]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, "") + .replace(/]*>[\s\S]*?<\/sup>/gi, "") + .trim(); + }) + .join(preserveHtmlInPrompt ? "\n" : " "); + // Get few-shot examples (existing behavior encapsulated) if (completionConfig.debugMode) { - console.debug(`[llmCompletion] Fetching few-shot examples with query: "${sourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`); + console.debug(`[llmCompletion] Fetching few-shot examples with query: "${searchSourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`); } const finalExamples = await fetchFewShotExamples( - sourceContent, + searchSourceContent, currentCellId, numberOfFewShotExamples, completionConfig.useOnlyValidatedExamples @@ -203,12 +223,11 @@ export async function llmCompletion( try { const currentCellIdString = currentCellIds.join(", "); - const currentCellSourceContent = sourceContent; // Generate few-shot examples const fewShotExamples = buildFewShotExamplesText( finalExamples, - Boolean(completionConfig.allowHtmlPredictions), + preserveHtmlInPrompt, fewShotExampleFormat || "source-and-target" ); console.log(`[llmCompletion] Built few-shot examples text (${fewShotExamples.length} chars, format: ${fewShotExampleFormat}):`, fewShotExamples.substring(0, 200) + '...'); @@ -223,7 +242,7 @@ export async function llmCompletion( fewShotExamples, precedingTranslationPairs, currentCellSourceContent, - Boolean(completionConfig.allowHtmlPredictions), + preserveHtmlInPrompt, fewShotExampleFormat || "source-and-target", sourceLanguage ); diff --git a/src/test/suite/codexCellEditorProvider.test.ts b/src/test/suite/codexCellEditorProvider.test.ts index 29a2a5fcf..7079d0481 100644 --- a/src/test/suite/codexCellEditorProvider.test.ts +++ b/src/test/suite/codexCellEditorProvider.test.ts @@ -3865,7 +3865,7 @@ suite("CodexCellEditorProvider Test Suite", () => { ); const cellId = codexSubtitleContent.cells[0].metadata.id; - const sourceContent = "Test source content"; + const sourceContent = "

Test source content

"; // Track onlyValidated parameter let capturedOnlyValidated: boolean | null = null; @@ -3968,7 +3968,7 @@ suite("CodexCellEditorProvider Test Suite", () => { ); const cellId = codexSubtitleContent.cells[0].metadata.id; - const sourceContent = "Test source content"; + const sourceContent = "

Test source content

"; // Track onlyValidated parameter let capturedOnlyValidated: boolean | null = null; @@ -3982,7 +3982,13 @@ suite("CodexCellEditorProvider Test Suite", () => { return []; } if (command === "codex-editor-extension.getSourceCellByCellIdFromAllSourceCells") { - return { cellId: args[0], content: sourceContent, versions: [], notebookId: "nb1" } as MinimalCellResult; + return { + cellId: args[0], + content: sourceContent, + rawContent: sourceContent, + versions: [], + notebookId: "nb1", + } as MinimalCellResult; } return originalExecuteCommand.apply(vscode.commands, [command, ...args]); }; @@ -4207,7 +4213,7 @@ suite("CodexCellEditorProvider Test Suite", () => { ); const cellId = codexSubtitleContent.cells[0].metadata.id; - const sourceContent = "Test source content"; + const sourceContent = "

Test source content

"; const htmlExample = "HTML content"; // Mock translation pairs with HTML content @@ -4229,7 +4235,13 @@ suite("CodexCellEditorProvider Test Suite", () => { return mockTranslationPairs; } if (command === "codex-editor-extension.getSourceCellByCellIdFromAllSourceCells") { - return { cellId: args[0], content: sourceContent, versions: [], notebookId: "nb1" } as MinimalCellResult; + return { + cellId: args[0], + content: sourceContent, + rawContent: sourceContent, + versions: [], + notebookId: "nb1", + } as MinimalCellResult; } return originalExecuteCommand.apply(vscode.commands, [command, ...args]); }; @@ -4303,6 +4315,10 @@ suite("CodexCellEditorProvider Test Suite", () => { assert.ok(userMessage.content.includes("Test source content

]]>
"), + "Current task source should preserve source HTML when allowHtmlPredictions is enabled" + ); // Verify system message mentions HTML const systemMessage = (capturedMessages as any[]).find((m: any) => m.role === "system"); From e6c5932bad457ee5cadb17d34f60846d3c709d8d Mon Sep 17 00:00:00 2001 From: Ben Scholtens Date: Tue, 17 Mar 2026 17:30:25 -0400 Subject: [PATCH 4/5] remove logs --- .../translationSuggestions/llmCompletion.ts | 3 +- .../translationSuggestions/shared.ts | 32 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts index 2032337ac..047608f0b 100644 --- a/src/providers/translationSuggestions/llmCompletion.ts +++ b/src/providers/translationSuggestions/llmCompletion.ts @@ -226,7 +226,7 @@ export async function llmCompletion( // Generate few-shot examples const fewShotExamples = buildFewShotExamplesText( - finalExamples, + finalExamples, preserveHtmlInPrompt, fewShotExampleFormat || "source-and-target" ); @@ -235,7 +235,6 @@ export async function llmCompletion( // Build messages — buildMessages is the single source of truth for // system message construction. Pass the raw chatSystemMessage and let // buildMessages append instructions exactly once. - console.log(`[llmCompletion] System message from config (first 200 chars): "${chatSystemMessage?.substring(0, 200)}..."`); const messages = buildMessages( targetLanguage, chatSystemMessage, diff --git a/src/providers/translationSuggestions/shared.ts b/src/providers/translationSuggestions/shared.ts index c4d7c3533..0652528f9 100644 --- a/src/providers/translationSuggestions/shared.ts +++ b/src/providers/translationSuggestions/shared.ts @@ -14,7 +14,7 @@ export async function fetchFewShotExamples( // Use a higher multiplier since many candidates may be incomplete pairs const initialCandidateCount = Math.max(numberOfFewShotExamples * 10, 100); console.debug(`[fetchFewShotExamples] Starting search with query: "${sourceContent}" (length: ${sourceContent?.length || 0}), requesting ${initialCandidateCount} candidates, validated only: ${useOnlyValidatedExamples}`); - + let similarSourceCells: TranslationPair[] = []; try { similarSourceCells = await vscode.commands.executeCommand( @@ -52,7 +52,7 @@ export async function fetchFewShotExamples( // Instead of filtering, rank all valid complete pairs by relevance const currentTokens = tokenizeText({ method: "whitespace_and_punctuation", text: sourceContent }); - + const rankedPairs = (similarSourceCells || []) .filter((pair) => { // Basic validity filters only @@ -62,7 +62,7 @@ export async function fetchFewShotExamples( } return false; } - + // Must have both source and target content for complete pairs const pairSourceContent = pair.sourceCell?.content || ""; const pairTargetContent = pair.targetCell?.content || ""; @@ -70,7 +70,7 @@ export async function fetchFewShotExamples( console.debug(`[fetchFewShotExamples] Filtering out pair ${pair.cellId} - incomplete pair (missing source or target)`); return false; } - + return true; }) .map((pair) => { @@ -79,13 +79,13 @@ export async function fetchFewShotExamples( const pairSourceContentRaw = pair.sourceCell?.content || ""; const pairSourceContentSanitized = sanitizeHtmlContent(pairSourceContentRaw); const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContentSanitized }); - + // Calculate overlap ratio const overlapCount = currentTokens.filter(token => pairTokens.includes(token)).length; const overlapRatio = currentTokens.length > 0 ? overlapCount / currentTokens.length : 0; - + console.debug(`[fetchFewShotExamples] Pair ${pair.cellId} - overlap: ${overlapCount}/${currentTokens.length} = ${(overlapRatio * 100).toFixed(1)}%`); - + return { pair, overlapRatio, @@ -99,23 +99,23 @@ export async function fetchFewShotExamples( } return b.overlapCount - a.overlapCount; }); - + console.debug(`[fetchFewShotExamples] Ranked ${rankedPairs.length} complete pairs by relevance`); - + // Take the top N most relevant complete pairs const filteredSimilarSourceCells = rankedPairs .slice(0, numberOfFewShotExamples) .map(ranked => ranked.pair); console.debug(`[fetchFewShotExamples] Returning ${filteredSimilarSourceCells.length} top-ranked examples (requested: ${numberOfFewShotExamples})`); - + if (filteredSimilarSourceCells.length === 0) { console.debug(`[fetchFewShotExamples] No complete translation pairs found. Source length: ${sourceContent?.length || 0}`); console.debug(`[fetchFewShotExamples] Database may contain only incomplete pairs (source-only or target-only).`); } else if (filteredSimilarSourceCells.length < numberOfFewShotExamples) { console.debug(`[fetchFewShotExamples] Found fewer examples than requested: ${filteredSimilarSourceCells.length}/${numberOfFewShotExamples}`); } - + return filteredSimilarSourceCells; } @@ -176,11 +176,11 @@ export async function getPrecedingTranslationPairs( } export function buildFewShotExamplesText( - pairs: TranslationPair[], - allowHtml: boolean = false, + pairs: TranslationPair[], + allowHtml: boolean = false, exampleFormat: string = "source-and-target" ): string { - console.debug(`[buildFewShotExamplesText] Building ${pairs.length} examples in '${exampleFormat}' format, allowHtml=${allowHtml}`); + const examplesInner = pairs .map((pair, idx) => { @@ -191,11 +191,11 @@ export function buildFewShotExamplesText( if (allowHtml && idx < 3) { const hasHtmlInTarget = /<[a-z][^>]*>/i.test(target); const hasHtmlInSource = /<[a-z][^>]*>/i.test(source); - console.log(`[buildFewShotExamplesText] Example ${idx}: hasHtmlInSource=${hasHtmlInSource}, hasHtmlInTarget=${hasHtmlInTarget}, targetRawContent=${pair.targetCell?.rawContent ? 'present' : 'MISSING'}, target preview="${target.substring(0, 100)}"`); + } const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target); const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source); - + // Format examples based on the setting if (exampleFormat === "target-only") { return `${targetInner}`; From e426f0c1a97e70734518f35caf785e5281b24a47 Mon Sep 17 00:00:00 2001 From: Ben Scholtens Date: Tue, 17 Mar 2026 23:07:38 -0400 Subject: [PATCH 5/5] Update CodexCellEditorProvider tests to enhance system message assertions - Adjusted assertions to verify that the system message includes the target language "fr" or "French". - Updated format instructions check to ensure it mentions HTML/formatting handling instead of just plain text when HTML is disabled. --- src/test/suite/codexCellEditorProvider.test.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/suite/codexCellEditorProvider.test.ts b/src/test/suite/codexCellEditorProvider.test.ts index 7079d0481..ad83f1831 100644 --- a/src/test/suite/codexCellEditorProvider.test.ts +++ b/src/test/suite/codexCellEditorProvider.test.ts @@ -3826,7 +3826,7 @@ suite("CodexCellEditorProvider Test Suite", () => { assert.ok(userMessage, "Should have a user message"); // Verify system message contains expected content - assert.ok(systemMessage.content.includes("target language"), "System message should mention target language"); + assert.ok(systemMessage.content.includes("fr") || systemMessage.content.includes("target language"), "System message should mention target language"); assert.ok(systemMessage.content.includes("fr") || systemMessage.content.includes("French"), "System message should include target language"); // Verify user message contains examples @@ -4180,10 +4180,10 @@ suite("CodexCellEditorProvider Test Suite", () => { "System message should contain translation instructions" ); - // Verify format instructions (plain text since allowHtmlPredictions is false) + // Verify format instructions (HTML preservation guidance is always included) assert.ok( - systemContent.includes("plain text") || systemContent.includes("no XML/HTML"), - "System message should mention plain text format when HTML is disabled" + systemContent.includes("HTML") || systemContent.includes("formatting"), + "System message should mention HTML/formatting handling" ); // Verify reference to examples/patterns