From df324801f859ebe8a5114f810abd58c70035dd02 Mon Sep 17 00:00:00 2001
From: "yangxinxin.24" <yangxinxin.24@bytedance.com>
Date: Thu, 19 Mar 2026 15:13:47 +0800
Subject: [PATCH 1/2] feat(embedder): use summary for file embedding in
 semantic pipeline

When files are processed through the semantic pipeline (SemanticDag),
use the pre-generated summary (AST skeleton or LLM summary) for
embedding instead of reading raw file content. This ensures code files,
markdown, and other text files within a repository are indexed by their
semantic summary rather than truncated raw content.

- Add use_summary flag to VectorizeTask, _vectorize_single_file, and vectorize_file
- Set use_summary=True in _file_summary_task when a non-empty summary is available
- Truncate the AST skeleton to max_skeleton_chars (default 12000 chars, ~3000 tokens)
  in _generate_text_summary before it is used for embedding
- Add max_skeleton_chars config field to SemanticConfig
- index_resource and memory paths are unaffected (use_summary defaults to False)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 openviking/storage/queuefs/semantic_dag.py    |  4 ++
 .../storage/queuefs/semantic_processor.py     |  5 +++
 openviking/utils/embedding_utils.py           | 40 +++++++++++--------
 openviking_cli/utils/config/parser_config.py  |  3 ++
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py
index 79213f71..0b9c1338 100644
--- a/openviking/storage/queuefs/semantic_dag.py
+++ b/openviking/storage/queuefs/semantic_dag.py
@@ -57,6 +57,7 @@ class VectorizeTask:
     file_path: Optional[str] = None
     summary_dict: Optional[Dict[str, str]] = None
     parent_uri: Optional[str] = None
+    use_summary: bool = False
     # For directory tasks
     abstract: Optional[str] = None
     overview: Optional[str] = None
@@ -192,6 +193,7 @@ async def wrapped_on_complete() -> None:
                             summary_dict=task.summary_dict,
                             ctx=task.ctx,
                             semantic_msg_id=task.semantic_msg_id,
+                            use_summary=task.use_summary,
                         )
                     )
                 else:
@@ -432,6 +434,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None:
 
         try:
             if need_vectorize:
+                use_summary = bool(summary_dict.get("summary"))
                 task = VectorizeTask(
                     task_type="file",
                     uri=file_path,
@@ -441,6 +444,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None:
                     file_path=file_path,
                     summary_dict=summary_dict,
                     parent_uri=parent_uri,
+                    use_summary=use_summary,
                 )
                 await self._add_vectorize_task(task)
         except Exception as e:
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index 2b92b73b..f767a580 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -651,6 +651,9 @@ async def _generate_text_summary(
                 verbose = code_mode == "ast_llm"
                 skeleton_text = extract_skeleton(file_name, content, verbose=verbose)
                 if skeleton_text:
+                    max_skeleton_chars = get_openviking_config().semantic.max_skeleton_chars
+                    if len(skeleton_text) > max_skeleton_chars:
+                        skeleton_text = skeleton_text[:max_skeleton_chars]
                     if code_mode == "ast":
                         return {"name": file_name, "summary": skeleton_text}
                     else:  # ast_llm
@@ -1069,6 +1072,7 @@ async def _vectorize_single_file(
         summary_dict: Dict[str, str],
         ctx: Optional[RequestContext] = None,
         semantic_msg_id: Optional[str] = None,
+        use_summary: bool = False,
     ) -> None:
         """Vectorize a single file using its content or summary."""
         from openviking.utils.embedding_utils import vectorize_file
@@ -1081,4 +1085,5 @@ async def _vectorize_single_file(
             context_type=context_type,
             ctx=active_ctx,
             semantic_msg_id=semantic_msg_id,
+            use_summary=use_summary,
         )
diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py
index 9a2f9d2d..35aeadcc 100644
--- a/openviking/utils/embedding_utils.py
+++ b/openviking/utils/embedding_utils.py
@@ -213,12 +213,14 @@ async def vectorize_file(
     context_type: str = "resource",
     ctx: Optional[RequestContext] = None,
     semantic_msg_id: Optional[str] = None,
+    use_summary: bool = False,
 ) -> None:
     """
     Vectorize a single file.
 
     Creates Context object for the file and enqueues it.
-    Reads content for TEXT files, otherwise uses summary.
+    If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario).
+    Otherwise reads raw file content for TEXT files, falls back to summary on failure.
     """
     enqueued = False
 
@@ -260,21 +262,27 @@ async def vectorize_file(
                 )
                 return
         elif content_type == ResourceContentType.TEXT:
-            # For text files, try to read content
-            try:
-                content = await viking_fs.read_file(file_path, ctx=ctx)
-                if isinstance(content, bytes):
-                    content = content.decode("utf-8", errors="replace")
-                context.set_vectorize(Vectorize(text=content))
-            except Exception as e:
-                logger.warning(
-                    f"Failed to read file content for {file_path}, falling back to summary: {e}"
-                )
-                if summary:
-                    context.set_vectorize(Vectorize(text=summary))
-                else:
-                    logger.warning(f"No summary available for {file_path}, skipping vectorization")
-                    return
+            if use_summary and summary:
+                # Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding
+                context.set_vectorize(Vectorize(text=summary))
+            else:
+                # Default: read raw file content
+                try:
+                    content = await viking_fs.read_file(file_path, ctx=ctx)
+                    if isinstance(content, bytes):
+                        content = content.decode("utf-8", errors="replace")
+                    context.set_vectorize(Vectorize(text=content))
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to read file content for {file_path}, falling back to summary: {e}"
+                    )
+                    if summary:
+                        context.set_vectorize(Vectorize(text=summary))
+                    else:
+                        logger.warning(
+                            f"No summary available for {file_path}, skipping vectorization"
+                        )
+                        return
         elif summary:
             # For non-text files, use summary
             context.set_vectorize(Vectorize(text=summary))
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py
index c8ff46aa..216cdad4 100644
--- a/openviking_cli/utils/config/parser_config.py
+++ b/openviking_cli/utils/config/parser_config.py
@@ -492,6 +492,9 @@ class SemanticConfig:
     max_file_content_chars: int = 30000
     """Maximum characters of file content sent to LLM for summary generation."""
 
+    max_skeleton_chars: int = 12000
+    """Maximum characters of AST skeleton used for embedding (~3000 tokens)."""
+
     max_overview_prompt_chars: int = 60000
     """Maximum characters allowed in the overview generation prompt.
     If exceeded, file summaries are batched and merged."""

From 61d03fc952b02f4b1f69e734b239e48cad73e3ed Mon Sep 17 00:00:00 2001
From: "yangxinxin.24" <yangxinxin.24@bytedance.com>
Date: Thu, 19 Mar 2026 17:48:12 +0800
Subject: [PATCH 2/2] fix(embedding): only use summary for code repo embedding,
 not plain text/doc files

Add `is_code_repo` flag to `SemanticMsg` and propagate it through the
pipeline so that summary-based embedding (AST skeleton or LLM summary) is
only applied when processing a code repository
(`source_format == "repository"`). For plain text, markdown, and other
non-repo resources, raw file content is used for embedding as before.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 openviking/storage/queuefs/semantic_dag.py       | 4 +++-
 openviking/storage/queuefs/semantic_msg.py       | 4 ++++
 openviking/storage/queuefs/semantic_processor.py | 1 +
 openviking/utils/resource_processor.py           | 2 ++
 openviking/utils/summarizer.py                   | 1 +
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py
index 0b9c1338..55cacc69 100644
--- a/openviking/storage/queuefs/semantic_dag.py
+++ b/openviking/storage/queuefs/semantic_dag.py
@@ -77,6 +77,7 @@ def __init__(
         semantic_msg_id: Optional[str] = None,
         recursive: bool = True,
         lifecycle_lock_handle_id: str = "",
+        is_code_repo: bool = False,
     ):
         self._processor = processor
         self._context_type = context_type
@@ -87,6 +88,7 @@ def __init__(
         self._semantic_msg_id = semantic_msg_id
         self._recursive = recursive
         self._lifecycle_lock_handle_id = lifecycle_lock_handle_id
+        self._is_code_repo = is_code_repo
         self._llm_sem = asyncio.Semaphore(max_concurrent_llm)
         self._viking_fs = get_viking_fs()
         self._nodes: Dict[str, DirNode] = {}
@@ -434,7 +436,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None:
 
         try:
             if need_vectorize:
-                use_summary = bool(summary_dict.get("summary"))
+                use_summary = self._is_code_repo and bool(summary_dict.get("summary"))
                 task = VectorizeTask(
                     task_type="file",
                     uri=file_path,
diff --git a/openviking/storage/queuefs/semantic_msg.py b/openviking/storage/queuefs/semantic_msg.py
index 720948e8..fbd56647 100644
--- a/openviking/storage/queuefs/semantic_msg.py
+++ b/openviking/storage/queuefs/semantic_msg.py
@@ -40,6 +40,7 @@ class SemanticMsg:
     telemetry_id: str = ""
     target_uri: str = ""
     lifecycle_lock_handle_id: str = ""
+    is_code_repo: bool = False
     changes: Optional[Dict[str, List[str]]] = (
         None  # {"added": [...], "modified": [...], "deleted": [...]}
     )
@@ -57,6 +58,7 @@ def __init__(
         telemetry_id: str = "",
         target_uri: str = "",
         lifecycle_lock_handle_id: str = "",
+        is_code_repo: bool = False,
         changes: Optional[Dict[str, List[str]]] = None,
     ):
         self.id = str(uuid4())
@@ -71,6 +73,7 @@ def __init__(
         self.telemetry_id = telemetry_id
         self.target_uri = target_uri
         self.lifecycle_lock_handle_id = lifecycle_lock_handle_id
+        self.is_code_repo = is_code_repo
         self.changes = changes
 
     def to_dict(self) -> Dict[str, Any]:
@@ -110,6 +113,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg":
             telemetry_id=data.get("telemetry_id", ""),
             target_uri=data.get("target_uri", ""),
             lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""),
+            is_code_repo=data.get("is_code_repo", False),
             changes=data.get("changes"),
         )
         if "id" in data and data["id"]:
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index f767a580..cf164177 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -264,6 +264,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str,
                         semantic_msg_id=msg.id,
                         recursive=msg.recursive,
                         lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id,
+                        is_code_repo=msg.is_code_repo,
                     )
                     self._dag_executor = executor
                     await executor.run(msg.uri)
diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py
index ef6185dc..9dd61f97 100644
--- a/openviking/utils/resource_processor.py
+++ b/openviking/utils/resource_processor.py
@@ -265,6 +265,7 @@ async def process_resource(
             should_summarize = summarize or build_index
             if should_summarize:
                 skip_vec = not build_index
+                is_code_repo = parse_result.source_format == "repository"
                 try:
                     with telemetry.measure("resource.summarize"):
                         await self._get_summarizer().summarize(
@@ -273,6 +274,7 @@ async def process_resource(
                             skip_vectorization=skip_vec,
                             lifecycle_lock_handle_id=lifecycle_lock_handle_id,
                             temp_uris=[temp_uri_for_summarize],
+                            is_code_repo=is_code_repo,
                             **kwargs,
                         )
                 except Exception as e:
diff --git a/openviking/utils/summarizer.py b/openviking/utils/summarizer.py
index e9a1cb20..dff099e6 100644
--- a/openviking/utils/summarizer.py
+++ b/openviking/utils/summarizer.py
@@ -74,6 +74,7 @@ async def summarize(
                 telemetry_id=telemetry.telemetry_id if telemetry.enabled else "",
                 target_uri=uri if uri != temp_uri else None,
                 lifecycle_lock_handle_id=lifecycle_lock_handle_id,
+                is_code_repo=kwargs.get("is_code_repo", False),
             )
             await semantic_queue.enqueue(msg)
             enqueued_count += 1