Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@ export async function getTranslationPairsFromSourceCellQuery(
debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding direct result for ${cellId}`);
translationPairs.push({
cellId,
sourceCell: { cellId, content: searchResult.sourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
targetCell: { cellId, content: searchResult.targetContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
sourceCell: { cellId, content: searchResult.sourceContent, rawContent: searchResult.rawSourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
targetCell: { cellId, content: searchResult.targetContent, rawContent: searchResult.rawTargetContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
});
} else {
debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - empty content after trim`);
Expand All @@ -278,8 +278,8 @@ export async function getTranslationPairsFromSourceCellQuery(
debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding fetched result for ${cellId}`);
translationPairs.push({
cellId,
sourceCell: { cellId, content: translationPair.sourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
targetCell: { cellId, content: translationPair.targetContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
sourceCell: { cellId, content: translationPair.sourceContent, rawContent: translationPair.rawSourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
targetCell: { cellId, content: translationPair.targetContent, rawContent: translationPair.rawTargetContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
});
} else {
debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - no valid translation pair found`);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3038,6 +3038,8 @@ export class SQLiteIndexManager {
cellLabel: row.cell_label, // NO FALLBACK
sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content,
rawSourceContent: row.raw_source_content || row.source_content,
rawTargetContent: row.raw_target_content || row.target_content,
content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
uri: row.uri,
line: row.line,
Expand Down Expand Up @@ -3186,6 +3188,8 @@ export class SQLiteIndexManager {
cellLabel: row.cell_label, // NO FALLBACK
sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent,
rawSourceContent: row.raw_source_content || row.source_content,
rawTargetContent: rawTargetContent || targetContent,
content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
uri: row.uri,
line: row.line,
Expand Down Expand Up @@ -3272,6 +3276,8 @@ export class SQLiteIndexManager {
cellLabel: row.cell_label || null,
sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content,
rawSourceContent: row.raw_source_content || row.source_content,
rawTargetContent: row.raw_target_content || row.target_content,
content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
uri: row.uri,
line: row.line,
Expand Down Expand Up @@ -3442,6 +3448,8 @@ export class SQLiteIndexManager {
cellLabel: row.cell_label, // NO FALLBACK - show raw value
sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent,
rawSourceContent: row.raw_source_content || row.source_content,
rawTargetContent: rawTargetContent || targetContent,
content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
uri: row.uri,
line: row.line,
Expand Down
37 changes: 28 additions & 9 deletions src/providers/translationSuggestions/llmCompletion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,18 @@ export async function llmCompletion(
throw new Error(`No source content found for cell ${currentCellId}. The search index may be incomplete. Try running "Force Complete Rebuild" from the command palette.`);
}

// Sanitize HTML content to extract plain text (handles transcription spans, etc.)
// Convert source HTML into search-friendly plain text while preserving word
// boundaries that would otherwise be lost when tags are stripped.
const sanitizeHtmlContent = (html: string): string => {
if (!html) return '';
return html
.replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '')
.replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '')
.replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, '')
.replace(/<br\s*\/?>/gi, ' ')
.replace(/<\/p>/gi, ' ')
.replace(/<\/div>/gi, ' ')
.replace(/<\/li>/gi, ' ')
.replace(/<[^>]*>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
Expand All @@ -157,16 +161,32 @@ export async function llmCompletion(
.trim();
};

const sourceContent = validSourceCells
.map((cell) => sanitizeHtmlContent(cell!.content || ""))
const preserveHtmlInPrompt = Boolean(completionConfig.allowHtmlPredictions);
const searchSourceContent = validSourceCells
.map((cell) => sanitizeHtmlContent(cell?.rawContent || cell?.content || ""))
.join(" ");

const currentCellSourceContent = validSourceCells
.map((cell) => {
const rawSourceContent = cell?.rawContent || cell?.content || "";
if (!preserveHtmlInPrompt) {
return sanitizeHtmlContent(rawSourceContent);
}

return rawSourceContent
.replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, "")
.replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, "")
.replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, "")
.trim();
})
.join(preserveHtmlInPrompt ? "\n" : " ");

// Get few-shot examples (existing behavior encapsulated)
if (completionConfig.debugMode) {
console.debug(`[llmCompletion] Fetching few-shot examples with query: "${sourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`);
console.debug(`[llmCompletion] Fetching few-shot examples with query: "${searchSourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`);
}
const finalExamples = await fetchFewShotExamples(
sourceContent,
searchSourceContent,
currentCellId,
numberOfFewShotExamples,
completionConfig.useOnlyValidatedExamples
Expand Down Expand Up @@ -203,12 +223,11 @@ export async function llmCompletion(

try {
const currentCellIdString = currentCellIds.join(", ");
const currentCellSourceContent = sourceContent;

// Generate few-shot examples
const fewShotExamples = buildFewShotExamplesText(
finalExamples,
Boolean(completionConfig.allowHtmlPredictions),
finalExamples,
preserveHtmlInPrompt,
fewShotExampleFormat || "source-and-target"
);
console.log(`[llmCompletion] Built few-shot examples text (${fewShotExamples.length} chars, format: ${fewShotExampleFormat}):`, fewShotExamples.substring(0, 200) + '...');
Expand All @@ -222,7 +241,7 @@ export async function llmCompletion(
fewShotExamples,
precedingTranslationPairs,
currentCellSourceContent,
Boolean(completionConfig.allowHtmlPredictions),
preserveHtmlInPrompt,
fewShotExampleFormat || "source-and-target",
sourceLanguage
);
Expand Down
95 changes: 50 additions & 45 deletions src/providers/translationSuggestions/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export async function fetchFewShotExamples(
// Use a higher multiplier since many candidates may be incomplete pairs
const initialCandidateCount = Math.max(numberOfFewShotExamples * 10, 100);
console.debug(`[fetchFewShotExamples] Starting search with query: "${sourceContent}" (length: ${sourceContent?.length || 0}), requesting ${initialCandidateCount} candidates, validated only: ${useOnlyValidatedExamples}`);

let similarSourceCells: TranslationPair[] = [];
try {
similarSourceCells = await vscode.commands.executeCommand(
Expand Down Expand Up @@ -52,7 +52,7 @@ export async function fetchFewShotExamples(

// Instead of filtering, rank all valid complete pairs by relevance
const currentTokens = tokenizeText({ method: "whitespace_and_punctuation", text: sourceContent });

const rankedPairs = (similarSourceCells || [])
.filter((pair) => {
// Basic validity filters only
Expand All @@ -62,15 +62,15 @@ export async function fetchFewShotExamples(
}
return false;
}

// Must have both source and target content for complete pairs
const pairSourceContent = pair.sourceCell?.content || "";
const pairTargetContent = pair.targetCell?.content || "";
if (!pairSourceContent.trim() || !pairTargetContent.trim()) {
console.debug(`[fetchFewShotExamples] Filtering out pair ${pair.cellId} - incomplete pair (missing source or target)`);
return false;
}

return true;
})
.map((pair) => {
Expand All @@ -79,13 +79,13 @@ export async function fetchFewShotExamples(
const pairSourceContentRaw = pair.sourceCell?.content || "";
const pairSourceContentSanitized = sanitizeHtmlContent(pairSourceContentRaw);
const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContentSanitized });

// Calculate overlap ratio
const overlapCount = currentTokens.filter(token => pairTokens.includes(token)).length;
const overlapRatio = currentTokens.length > 0 ? overlapCount / currentTokens.length : 0;

console.debug(`[fetchFewShotExamples] Pair ${pair.cellId} - overlap: ${overlapCount}/${currentTokens.length} = ${(overlapRatio * 100).toFixed(1)}%`);

return {
pair,
overlapRatio,
Expand All @@ -99,23 +99,23 @@ export async function fetchFewShotExamples(
}
return b.overlapCount - a.overlapCount;
});

console.debug(`[fetchFewShotExamples] Ranked ${rankedPairs.length} complete pairs by relevance`);

// Take the top N most relevant complete pairs
const filteredSimilarSourceCells = rankedPairs
.slice(0, numberOfFewShotExamples)
.map(ranked => ranked.pair);

console.debug(`[fetchFewShotExamples] Returning ${filteredSimilarSourceCells.length} top-ranked examples (requested: ${numberOfFewShotExamples})`);

if (filteredSimilarSourceCells.length === 0) {
console.debug(`[fetchFewShotExamples] No complete translation pairs found. Source length: ${sourceContent?.length || 0}`);
console.debug(`[fetchFewShotExamples] Database may contain only incomplete pairs (source-only or target-only).`);
} else if (filteredSimilarSourceCells.length < numberOfFewShotExamples) {
console.debug(`[fetchFewShotExamples] Found fewer examples than requested: ${filteredSimilarSourceCells.length}/${numberOfFewShotExamples}`);
}

return filteredSimilarSourceCells;
}

Expand Down Expand Up @@ -176,21 +176,26 @@ export async function getPrecedingTranslationPairs(
}

export function buildFewShotExamplesText(
pairs: TranslationPair[],
allowHtml: boolean = false,
pairs: TranslationPair[],
allowHtml: boolean = false,
exampleFormat: string = "source-and-target"
): string {
console.debug(`[buildFewShotExamplesText] Building ${pairs.length} examples in '${exampleFormat}' format`);


const examplesInner = pairs
.map((pair) => {
const sourceRaw = pair.sourceCell?.content ?? "";
const targetRaw = pair.targetCell?.content ?? "";
.map((pair, idx) => {
const sourceRaw = allowHtml ? (pair.sourceCell?.rawContent || pair.sourceCell?.content || "") : (pair.sourceCell?.content ?? "");
const targetRaw = allowHtml ? (pair.targetCell?.rawContent || pair.targetCell?.content || "") : (pair.targetCell?.content ?? "");
const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim();
const source = allowHtml ? sourceRaw.trim() : stripHtmlTags(sourceRaw).trim();
if (allowHtml && idx < 3) {
const hasHtmlInTarget = /<[a-z][^>]*>/i.test(target);
const hasHtmlInSource = /<[a-z][^>]*>/i.test(source);

}
const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target);
const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source);

// Format examples based on the setting
if (exampleFormat === "target-only") {
return `<example><target>${targetInner}</target></example>`;
Expand Down Expand Up @@ -218,44 +223,44 @@ export function buildMessages(
exampleFormat: string = "source-and-target",
sourceLanguage: string | null = null
): ChatMessage[] {
let systemMessage = chatSystemMessage || `You are a helpful assistant`;
const sourceLangText = sourceLanguage ? `${sourceLanguage}` : "the source language";
const targetLangText = targetLanguage || "the target language";

if (exampleFormat === "target-only") {
systemMessage += `\n\nReference translations are provided in XML <target> tags. Use these as examples of the translation style and patterns you should follow.`;
} else {
systemMessage += `\n\nInput sections for examples and context are provided in XML. Only use values within <source> and <target> tags.`;
// Build a focused system message: critical output format first, then translation guidance
const parts: string[] = [];

// User's custom instructions (from metadata.json) come first
if (chatSystemMessage) {
parts.push(chatSystemMessage);
}
// Preserve line breaks and specify output format
if (allowHtml) {
systemMessage += `\n\nYou may include inline HTML tags when appropriate (e.g., <span>, <i>, <b>) consistent with examples. Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters.`;

// Translation direction and approach
parts.push(`Translate from ${sourceLangText} to ${targetLangText}. This may be an ultra-low resource language — follow the patterns, style, and vocabulary of the provided reference data closely. When in doubt, err on the side of literalness.`);

// HTML preservation — always instruct to preserve HTML based on source
parts.push(`If the source text contains HTML formatting (e.g., <span>, <i>, <b> tags), preserve that HTML structure in your translation. Match the formatting of the source.`);

// Line preservation
parts.push(`Preserve original line breaks from <currentTask><source> by returning text with the same number of lines.`);

// Output format
parts.push(`Wrap your final translation in <final_answer>...</final_answer> tags. Provide only the translation — no commentary, explanations, or metadata.`);

// Data format hint
if (exampleFormat === "target-only") {
parts.push(`Reference translations are provided in XML <target> tags. Use these as examples of the translation style and patterns to follow.`);
} else {
systemMessage += `\n\nReturn plain text only (no XML/HTML). Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters.`;
}
const sourceLangText = sourceLanguage ? `from ${sourceLanguage} ` : "from the source language ";
systemMessage += `\n\nAlways translate ${sourceLangText}to the target language ${targetLanguage || ""
}, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`;

systemMessage += `\n\n1. Analyze the provided reference data to understand the translation patterns and style.`;
systemMessage += `\n2. Complete the partial or complete translation of the line.`;
systemMessage += `\n3. Ensure your translation fits seamlessly with the existing partial translation.`;
systemMessage += `\n4. Provide only the completed translation without any additional commentary or metadata.`;
systemMessage += `\n5. Translate only into the target language ${targetLanguage || ""}.`;
systemMessage += `\n6. Pay careful attention to the provided reference data.`;
systemMessage += `\n7. If in doubt, err on the side of literalness.`;
if (allowHtml) {
systemMessage += `\n8. If the project has any styles, return HTML with the appropriate tags or classes as per the examples in the translation memory.`;
parts.push(`Examples and context are provided in XML with <source> and <target> tags.`);
}

systemMessage += `\n\nWrap your final translation in <final_answer>...</final_answer> XML tags. Do not include any other XML tags in your response outside of these tags.`;
const systemMessage = parts.join("\n\n");

const contextXml = `<context>\n${precedingContextPairs.filter(Boolean).join("\n")}\n</context>`;
const currentTaskXml = allowHtml
? `<currentTask><source>${wrapCdata(currentCellSourceContent)}</source></currentTask>`
: `<currentTask><source>${xmlEscape(currentCellSourceContent)}</source></currentTask>`;

const userMessage = [
"## Instructions",
"Follow the translation patterns and style as shown.",
"## Translation Memory (XML)",
fewShotExamples,
"## Current Context (XML)",
Expand Down
Loading