genesis-ai-dev · dadukhankevin · Mar 12, 2026 · Mar 12, 2026 · Mar 13, 2026 · Mar 17, 2026
diff --git a/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts b/src/activationHelpers/contextAware/contentIndexes/indexes/search.ts
@@ -265,8 +265,8 @@ export async function getTranslationPairsFromSourceCellQuery(
                 debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding direct result for ${cellId}`);
                 translationPairs.push({
                     cellId,
-                    sourceCell: { cellId, content: searchResult.sourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
-                    targetCell: { cellId, content: searchResult.targetContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
+                    sourceCell: { cellId, content: searchResult.sourceContent, rawContent: searchResult.rawSourceContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
+                    targetCell: { cellId, content: searchResult.targetContent, rawContent: searchResult.rawTargetContent, uri: searchResult.uri || "", line: searchResult.line || 0 },
                 });
             } else {
                 debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - empty content after trim`);
@@ -278,8 +278,8 @@ export async function getTranslationPairsFromSourceCellQuery(
                 debug(`[getTranslationPairsFromSourceCellQuery] ✅ Adding fetched result for ${cellId}`);
                 translationPairs.push({
                     cellId,
-                    sourceCell: { cellId, content: translationPair.sourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
-                    targetCell: { cellId, content: translationPair.targetContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
+                    sourceCell: { cellId, content: translationPair.sourceContent, rawContent: translationPair.rawSourceContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
+                    targetCell: { cellId, content: translationPair.targetContent, rawContent: translationPair.rawTargetContent, uri: translationPair.uri || "", line: translationPair.line || 0 },
                 });
             } else {
                 debug(`[getTranslationPairsFromSourceCellQuery] ❌ Skipping ${cellId} - no valid translation pair found`);

diff --git a/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts b/src/activationHelpers/contextAware/contentIndexes/indexes/sqliteIndex.ts
@@ -3038,6 +3038,8 @@ export class SQLiteIndexManager {
                         cellLabel: row.cell_label, // NO FALLBACK
                         sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                         targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content,
+                        rawSourceContent: row.raw_source_content || row.source_content,
+                        rawTargetContent: row.raw_target_content || row.target_content,
                         content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                         uri: row.uri,
                         line: row.line,
@@ -3186,6 +3188,8 @@ export class SQLiteIndexManager {
                     cellLabel: row.cell_label, // NO FALLBACK
                     sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                     targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent,
+                    rawSourceContent: row.raw_source_content || row.source_content,
+                    rawTargetContent: rawTargetContent || targetContent,
                     content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                     uri: row.uri,
                     line: row.line,
@@ -3272,6 +3276,8 @@ export class SQLiteIndexManager {
                             cellLabel: row.cell_label || null,
                             sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                             targetContent: returnRawContent && row.raw_target_content ? row.raw_target_content : row.target_content,
+                            rawSourceContent: row.raw_source_content || row.source_content,
+                            rawTargetContent: row.raw_target_content || row.target_content,
                             content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                             uri: row.uri,
                             line: row.line,
@@ -3442,6 +3448,8 @@ export class SQLiteIndexManager {
                             cellLabel: row.cell_label, // NO FALLBACK - show raw value
                             sourceContent: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                             targetContent: returnRawContent && rawTargetContent ? rawTargetContent : targetContent,
+                            rawSourceContent: row.raw_source_content || row.source_content,
+                            rawTargetContent: rawTargetContent || targetContent,
                             content: returnRawContent && row.raw_source_content ? row.raw_source_content : row.source_content,
                             uri: row.uri,
                             line: row.line,

diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts
@@ -136,14 +136,18 @@ export async function llmCompletion(
             throw new Error(`No source content found for cell ${currentCellId}. The search index may be incomplete. Try running "Force Complete Rebuild" from the command palette.`);
         }
 
-        // Sanitize HTML content to extract plain text (handles transcription spans, etc.)
+        // Convert source HTML into search-friendly plain text while preserving word
+        // boundaries that would otherwise be lost when tags are stripped.
         const sanitizeHtmlContent = (html: string): string => {
             if (!html) return '';
             return html
                 .replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '')
                 .replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '')
                 .replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, '')
+                .replace(/<br\s*\/?>/gi, ' ')
                 .replace(/<\/p>/gi, ' ')
+                .replace(/<\/div>/gi, ' ')
+                .replace(/<\/li>/gi, ' ')
                 .replace(/<[^>]*>/g, '')
                 .replace(/&nbsp;/g, ' ')
                 .replace(/&amp;/g, '&')
@@ -157,16 +161,32 @@ export async function llmCompletion(
                 .trim();
         };
 
-        const sourceContent = validSourceCells
-            .map((cell) => sanitizeHtmlContent(cell!.content || ""))
+        const preserveHtmlInPrompt = Boolean(completionConfig.allowHtmlPredictions);
+        const searchSourceContent = validSourceCells
+            .map((cell) => sanitizeHtmlContent(cell?.rawContent || cell?.content || ""))
             .join(" ");
 
+        const currentCellSourceContent = validSourceCells
+            .map((cell) => {
+                const rawSourceContent = cell?.rawContent || cell?.content || "";
+                if (!preserveHtmlInPrompt) {
+                    return sanitizeHtmlContent(rawSourceContent);
+                }
+
+                return rawSourceContent
+                    .replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, "")
+                    .replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, "")
+                    .replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, "")
+                    .trim();
+            })
+            .join(preserveHtmlInPrompt ? "\n" : " ");
+
         // Get few-shot examples (existing behavior encapsulated)
         if (completionConfig.debugMode) {
-            console.debug(`[llmCompletion] Fetching few-shot examples with query: "${sourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`);
+            console.debug(`[llmCompletion] Fetching few-shot examples with query: "${searchSourceContent}", cellId: ${currentCellId}, count: ${numberOfFewShotExamples}, onlyValidated: ${completionConfig.useOnlyValidatedExamples}`);
         }
         const finalExamples = await fetchFewShotExamples(
-            sourceContent,
+            searchSourceContent,
             currentCellId,
             numberOfFewShotExamples,
             completionConfig.useOnlyValidatedExamples
@@ -203,12 +223,11 @@ export async function llmCompletion(
 
         try {
             const currentCellIdString = currentCellIds.join(", ");
-            const currentCellSourceContent = sourceContent;
 
             // Generate few-shot examples
             const fewShotExamples = buildFewShotExamplesText(
-                finalExamples, 
-                Boolean(completionConfig.allowHtmlPredictions), 
+                finalExamples,
+                preserveHtmlInPrompt,
                 fewShotExampleFormat || "source-and-target"
             );
             console.log(`[llmCompletion] Built few-shot examples text (${fewShotExamples.length} chars, format: ${fewShotExampleFormat}):`, fewShotExamples.substring(0, 200) + '...');
@@ -222,7 +241,7 @@ export async function llmCompletion(
                 fewShotExamples,
                 precedingTranslationPairs,
                 currentCellSourceContent,
-                Boolean(completionConfig.allowHtmlPredictions),
+                preserveHtmlInPrompt,
                 fewShotExampleFormat || "source-and-target",
                 sourceLanguage
             );

diff --git a/src/providers/translationSuggestions/shared.ts b/src/providers/translationSuggestions/shared.ts
@@ -14,7 +14,7 @@ export async function fetchFewShotExamples(
   // Use a higher multiplier since many candidates may be incomplete pairs
   const initialCandidateCount = Math.max(numberOfFewShotExamples * 10, 100);
   console.debug(`[fetchFewShotExamples] Starting search with query: "${sourceContent}" (length: ${sourceContent?.length || 0}), requesting ${initialCandidateCount} candidates, validated only: ${useOnlyValidatedExamples}`);
-  
+
   let similarSourceCells: TranslationPair[] = [];
   try {
     similarSourceCells = await vscode.commands.executeCommand(
@@ -52,7 +52,7 @@ export async function fetchFewShotExamples(
 
   // Instead of filtering, rank all valid complete pairs by relevance
   const currentTokens = tokenizeText({ method: "whitespace_and_punctuation", text: sourceContent });
-  
+
   const rankedPairs = (similarSourceCells || [])
     .filter((pair) => {
       // Basic validity filters only
@@ -62,15 +62,15 @@ export async function fetchFewShotExamples(
         }
         return false;
       }
-      
+
       // Must have both source and target content for complete pairs
       const pairSourceContent = pair.sourceCell?.content || "";
       const pairTargetContent = pair.targetCell?.content || "";
       if (!pairSourceContent.trim() || !pairTargetContent.trim()) {
         console.debug(`[fetchFewShotExamples] Filtering out pair ${pair.cellId} - incomplete pair (missing source or target)`);
         return false;
       }
-      
+
       return true;
     })
     .map((pair) => {
@@ -79,13 +79,13 @@ export async function fetchFewShotExamples(
       const pairSourceContentRaw = pair.sourceCell?.content || "";
       const pairSourceContentSanitized = sanitizeHtmlContent(pairSourceContentRaw);
       const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContentSanitized });
-      
+
       // Calculate overlap ratio
       const overlapCount = currentTokens.filter(token => pairTokens.includes(token)).length;
       const overlapRatio = currentTokens.length > 0 ? overlapCount / currentTokens.length : 0;
-      
+
       console.debug(`[fetchFewShotExamples] Pair ${pair.cellId} - overlap: ${overlapCount}/${currentTokens.length} = ${(overlapRatio * 100).toFixed(1)}%`);
-      
+
       return {
         pair,
         overlapRatio,
@@ -99,23 +99,23 @@ export async function fetchFewShotExamples(
       }
       return b.overlapCount - a.overlapCount;
     });
-  
+
   console.debug(`[fetchFewShotExamples] Ranked ${rankedPairs.length} complete pairs by relevance`);
-  
+
   // Take the top N most relevant complete pairs
   const filteredSimilarSourceCells = rankedPairs
     .slice(0, numberOfFewShotExamples)
     .map(ranked => ranked.pair);
 
   console.debug(`[fetchFewShotExamples] Returning ${filteredSimilarSourceCells.length} top-ranked examples (requested: ${numberOfFewShotExamples})`);
-  
+
   if (filteredSimilarSourceCells.length === 0) {
     console.debug(`[fetchFewShotExamples] No complete translation pairs found. Source length: ${sourceContent?.length || 0}`);
     console.debug(`[fetchFewShotExamples] Database may contain only incomplete pairs (source-only or target-only).`);
   } else if (filteredSimilarSourceCells.length < numberOfFewShotExamples) {
     console.debug(`[fetchFewShotExamples] Found fewer examples than requested: ${filteredSimilarSourceCells.length}/${numberOfFewShotExamples}`);
   }
-  
+
   return filteredSimilarSourceCells;
 }
 
@@ -176,21 +176,26 @@ export async function getPrecedingTranslationPairs(
 }
 
 export function buildFewShotExamplesText(
-  pairs: TranslationPair[], 
-  allowHtml: boolean = false, 
+  pairs: TranslationPair[],
+  allowHtml: boolean = false,
   exampleFormat: string = "source-and-target"
 ): string {
-  console.debug(`[buildFewShotExamplesText] Building ${pairs.length} examples in '${exampleFormat}' format`);
-  
+
+
   const examplesInner = pairs
-    .map((pair) => {
-      const sourceRaw = pair.sourceCell?.content ?? "";
-      const targetRaw = pair.targetCell?.content ?? "";
+    .map((pair, idx) => {
+      const sourceRaw = allowHtml ? (pair.sourceCell?.rawContent || pair.sourceCell?.content || "") : (pair.sourceCell?.content ?? "");
+      const targetRaw = allowHtml ? (pair.targetCell?.rawContent || pair.targetCell?.content || "") : (pair.targetCell?.content ?? "");
       const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim();
       const source = allowHtml ? sourceRaw.trim() : stripHtmlTags(sourceRaw).trim();
+      if (allowHtml && idx < 3) {
+        const hasHtmlInTarget = /<[a-z][^>]*>/i.test(target);
+        const hasHtmlInSource = /<[a-z][^>]*>/i.test(source);
+
+      }
       const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target);
       const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source);
-      
+
       // Format examples based on the setting
       if (exampleFormat === "target-only") {
         return `<example><target>${targetInner}</target></example>`;
@@ -218,44 +223,44 @@ export function buildMessages(
   exampleFormat: string = "source-and-target",
   sourceLanguage: string | null = null
 ): ChatMessage[] {
-  let systemMessage = chatSystemMessage || `You are a helpful assistant`;
+  const sourceLangText = sourceLanguage ? `${sourceLanguage}` : "the source language";
+  const targetLangText = targetLanguage || "the target language";
 
-  if (exampleFormat === "target-only") {
-    systemMessage += `\n\nReference translations are provided in XML <target> tags. Use these as examples of the translation style and patterns you should follow.`;
-  } else {
-    systemMessage += `\n\nInput sections for examples and context are provided in XML. Only use values within <source> and <target> tags.`;
+  // Build the system message in order: custom instructions, translation guidance, HTML and line-break rules, output format, data-format hint
+  const parts: string[] = [];
+
+  // User's custom instructions (from metadata.json) come first
+  if (chatSystemMessage) {
+    parts.push(chatSystemMessage);
   }
-  // Preserve line breaks and specify output format
-  if (allowHtml) {
-    systemMessage += `\n\nYou may include inline HTML tags when appropriate (e.g., <span>, <i>, <b>) consistent with examples. Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters.`;
+
+  // Translation direction and approach
+  parts.push(`Translate from ${sourceLangText} to ${targetLangText}. This may be an ultra-low resource language — follow the patterns, style, and vocabulary of the provided reference data closely. When in doubt, err on the side of literalness.`);
+
+  // HTML preservation — added unconditionally; the instruction itself only applies when the source text contains HTML
+  parts.push(`If the source text contains HTML formatting (e.g., <span>, <i>, <b> tags), preserve that HTML structure in your translation. Match the formatting of the source.`);
+
+  // Line preservation
+  parts.push(`Preserve original line breaks from <currentTask><source> by returning text with the same number of lines.`);
+
+  // Output format
+  parts.push(`Wrap your final translation in <final_answer>...</final_answer> tags. Provide only the translation — no commentary, explanations, or metadata.`);
+
+  // Data format hint
+  if (exampleFormat === "target-only") {
+    parts.push(`Reference translations are provided in XML <target> tags. Use these as examples of the translation style and patterns to follow.`);
   } else {
-    systemMessage += `\n\nReturn plain text only (no XML/HTML). Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters.`;
-  }
-  const sourceLangText = sourceLanguage ? `from ${sourceLanguage} ` : "from the source language ";
-  systemMessage += `\n\nAlways translate ${sourceLangText}to the target language ${targetLanguage || ""
-    }, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`;
-
-  systemMessage += `\n\n1. Analyze the provided reference data to understand the translation patterns and style.`;
-  systemMessage += `\n2. Complete the partial or complete translation of the line.`;
-  systemMessage += `\n3. Ensure your translation fits seamlessly with the existing partial translation.`;
-  systemMessage += `\n4. Provide only the completed translation without any additional commentary or metadata.`;
-  systemMessage += `\n5. Translate only into the target language ${targetLanguage || ""}.`;
-  systemMessage += `\n6. Pay careful attention to the provided reference data.`;
-  systemMessage += `\n7. If in doubt, err on the side of literalness.`;
-  if (allowHtml) {
-    systemMessage += `\n8. If the project has any styles, return HTML with the appropriate tags or classes as per the examples in the translation memory.`;
+    parts.push(`Examples and context are provided in XML with <source> and <target> tags.`);
   }
 
-  systemMessage += `\n\nWrap your final translation in <final_answer>...</final_answer> XML tags. Do not include any other XML tags in your response outside of these tags.`;
+  const systemMessage = parts.join("\n\n");
 
   const contextXml = `<context>\n${precedingContextPairs.filter(Boolean).join("\n")}\n</context>`;
   const currentTaskXml = allowHtml
     ? `<currentTask><source>${wrapCdata(currentCellSourceContent)}</source></currentTask>`
     : `<currentTask><source>${xmlEscape(currentCellSourceContent)}</source></currentTask>`;
 
   const userMessage = [
-    "## Instructions",
-    "Follow the translation patterns and style as shown.",
     "## Translation Memory (XML)",
     fewShotExamples,
     "## Current Context (XML)",