From df324801f859ebe8a5114f810abd58c70035dd02 Mon Sep 17 00:00:00 2001 From: "yangxinxin.24" Date: Thu, 19 Mar 2026 15:13:47 +0800 Subject: [PATCH 1/2] feat(embedder): use summary for file embedding in semantic pipeline When files are processed through the semantic pipeline (SemanticDag), use the pre-generated summary (AST skeleton or LLM summary) for embedding instead of reading raw file content. This ensures code files, markdown, and other text files within a repository are indexed by their semantic summary rather than truncated raw content. - Add use_summary flag to VectorizeTask, _vectorize_single_file, and vectorize_file - Set use_summary=True in _file_summary_task when a non-empty summary is available - Truncate AST skeleton to max_skeleton_chars (12000 chars, ~3000 tokens) before embedding - Add max_skeleton_chars config field to SemanticConfig - index_resource and memory paths are unaffected (use_summary defaults to False) Co-Authored-By: Claude Sonnet 4.6 --- openviking/storage/queuefs/semantic_dag.py | 4 ++ .../storage/queuefs/semantic_processor.py | 5 +++ openviking/utils/embedding_utils.py | 40 +++++++++++-------- openviking_cli/utils/config/parser_config.py | 3 ++ 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 79213f71..0b9c1338 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -57,6 +57,7 @@ class VectorizeTask: file_path: Optional[str] = None summary_dict: Optional[Dict[str, str]] = None parent_uri: Optional[str] = None + use_summary: bool = False # For directory tasks abstract: Optional[str] = None overview: Optional[str] = None @@ -192,6 +193,7 @@ async def wrapped_on_complete() -> None: summary_dict=task.summary_dict, ctx=task.ctx, semantic_msg_id=task.semantic_msg_id, + use_summary=task.use_summary, ) ) else: @@ -432,6 +434,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None: try: if need_vectorize: + use_summary = bool(summary_dict.get("summary")) task = VectorizeTask( task_type="file", uri=file_path, @@ -441,6 +444,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None: file_path=file_path, summary_dict=summary_dict, parent_uri=parent_uri, + use_summary=use_summary, ) await self._add_vectorize_task(task) except Exception as e: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 2b92b73b..f767a580 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -651,6 +651,9 @@ async def _generate_text_summary( verbose = code_mode == "ast_llm" skeleton_text = extract_skeleton(file_name, content, verbose=verbose) if skeleton_text: + max_skeleton_chars = get_openviking_config().semantic.max_skeleton_chars + if len(skeleton_text) > max_skeleton_chars: + skeleton_text = skeleton_text[:max_skeleton_chars] if code_mode == "ast": return {"name": file_name, "summary": skeleton_text} else: # ast_llm @@ -1069,6 +1072,7 @@ async def _vectorize_single_file( summary_dict: Dict[str, str], ctx: Optional[RequestContext] = None, semantic_msg_id: Optional[str] = None, + use_summary: bool = False, ) -> None: """Vectorize a single file using its content or summary.""" from openviking.utils.embedding_utils import vectorize_file @@ -1081,4 +1085,5 @@ async def _vectorize_single_file( context_type=context_type, ctx=active_ctx, semantic_msg_id=semantic_msg_id, + use_summary=use_summary, ) diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index 9a2f9d2d..35aeadcc 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -213,12 +213,14 @@ async def vectorize_file( context_type: str = "resource", ctx: Optional[RequestContext] = None, semantic_msg_id: Optional[str] = None, + use_summary: bool = False, ) -> None: """ Vectorize a single file. Creates Context object for the file and enqueues it. - Reads content for TEXT files, otherwise uses summary. + If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario). + Otherwise reads raw file content for TEXT files, falls back to summary on failure. """ enqueued = False @@ -260,21 +262,27 @@ async def vectorize_file( ) return elif content_type == ResourceContentType.TEXT: - # For text files, try to read content - try: - content = await viking_fs.read_file(file_path, ctx=ctx) - if isinstance(content, bytes): - content = content.decode("utf-8", errors="replace") - context.set_vectorize(Vectorize(text=content)) - except Exception as e: - logger.warning( - f"Failed to read file content for {file_path}, falling back to summary: {e}" - ) - if summary: - context.set_vectorize(Vectorize(text=summary)) - else: - logger.warning(f"No summary available for {file_path}, skipping vectorization") - return + if use_summary and summary: + # Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding + context.set_vectorize(Vectorize(text=summary)) + else: + # Default: read raw file content + try: + content = await viking_fs.read_file(file_path, ctx=ctx) + if isinstance(content, bytes): + content = content.decode("utf-8", errors="replace") + context.set_vectorize(Vectorize(text=content)) + except Exception as e: + logger.warning( + f"Failed to read file content for {file_path}, falling back to summary: {e}" + ) + if summary: + context.set_vectorize(Vectorize(text=summary)) + else: + logger.warning( + f"No summary available for {file_path}, skipping vectorization" + ) + return elif summary: # For non-text files, use summary context.set_vectorize(Vectorize(text=summary)) diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py index c8ff46aa..216cdad4 100644 --- a/openviking_cli/utils/config/parser_config.py +++ b/openviking_cli/utils/config/parser_config.py @@ -492,6 +492,9 @@ class SemanticConfig: max_file_content_chars: int = 30000 """Maximum characters of file content sent to LLM for summary generation.""" + max_skeleton_chars: int = 12000 + """Maximum characters of AST skeleton used for embedding (~3000 tokens).""" + max_overview_prompt_chars: int = 60000 """Maximum characters allowed in the overview generation prompt. If exceeded, file summaries are batched and merged.""" From 61d03fc952b02f4b1f69e734b239e48cad73e3ed Mon Sep 17 00:00:00 2001 From: "yangxinxin.24" Date: Thu, 19 Mar 2026 17:48:12 +0800 Subject: [PATCH 2/2] fix(embedding): only use summary for code repo embedding, not plain text/doc files Add `is_code_repo` flag to `SemanticMsg` and propagate it through the pipeline so that summary-based embedding (AST skeleton) is only applied when processing a code repository (`source_format == "repository"`). For plain text, markdown, and other non-repo resources, raw file content is used for embedding as before. Co-Authored-By: Claude Sonnet 4.6 --- openviking/storage/queuefs/semantic_dag.py | 4 +++- openviking/storage/queuefs/semantic_msg.py | 4 ++++ openviking/storage/queuefs/semantic_processor.py | 1 + openviking/utils/resource_processor.py | 2 ++ openviking/utils/summarizer.py | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 0b9c1338..55cacc69 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -77,6 +77,7 @@ def __init__( semantic_msg_id: Optional[str] = None, recursive: bool = True, lifecycle_lock_handle_id: str = "", + is_code_repo: bool = False, ): self._processor = processor self._context_type = context_type @@ -87,6 +88,7 @@ def __init__( self._semantic_msg_id = semantic_msg_id self._recursive = recursive self._lifecycle_lock_handle_id = lifecycle_lock_handle_id + self._is_code_repo = is_code_repo self._llm_sem = asyncio.Semaphore(max_concurrent_llm) self._viking_fs = get_viking_fs() self._nodes: Dict[str, DirNode] = {} @@ -434,7 +436,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None: try: if need_vectorize: - use_summary = bool(summary_dict.get("summary")) + use_summary = self._is_code_repo and bool(summary_dict.get("summary")) task = VectorizeTask( task_type="file", uri=file_path, diff --git a/openviking/storage/queuefs/semantic_msg.py b/openviking/storage/queuefs/semantic_msg.py index 720948e8..fbd56647 100644 --- a/openviking/storage/queuefs/semantic_msg.py +++ b/openviking/storage/queuefs/semantic_msg.py @@ -40,6 +40,7 @@ class SemanticMsg: telemetry_id: str = "" target_uri: str = "" lifecycle_lock_handle_id: str = "" + is_code_repo: bool = False changes: Optional[Dict[str, List[str]]] = ( None # {"added": [...], "modified": [...], "deleted": [...]} ) @@ -57,6 +58,7 @@ def __init__( telemetry_id: str = "", target_uri: str = "", lifecycle_lock_handle_id: str = "", + is_code_repo: bool = False, changes: Optional[Dict[str, List[str]]] = None, ): self.id = str(uuid4()) @@ -71,6 +73,7 @@ def __init__( self.telemetry_id = telemetry_id self.target_uri = target_uri self.lifecycle_lock_handle_id = lifecycle_lock_handle_id + self.is_code_repo = is_code_repo self.changes = changes def to_dict(self) -> Dict[str, Any]: @@ -110,6 +113,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg": telemetry_id=data.get("telemetry_id", ""), target_uri=data.get("target_uri", ""), lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""), + is_code_repo=data.get("is_code_repo", False), changes=data.get("changes"), ) if "id" in data and data["id"]: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index f767a580..cf164177 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -264,6 +264,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, semantic_msg_id=msg.id, recursive=msg.recursive, lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id, + is_code_repo=msg.is_code_repo, ) self._dag_executor = executor await executor.run(msg.uri) diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index ef6185dc..9dd61f97 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -265,6 +265,7 @@ async def process_resource( should_summarize = summarize or build_index if should_summarize: skip_vec = not build_index + is_code_repo = parse_result.source_format == "repository" try: with telemetry.measure("resource.summarize"): await self._get_summarizer().summarize( @@ -273,6 +274,7 @@ async def process_resource( skip_vectorization=skip_vec, lifecycle_lock_handle_id=lifecycle_lock_handle_id, temp_uris=[temp_uri_for_summarize], + is_code_repo=is_code_repo, **kwargs, ) except Exception as e: diff --git a/openviking/utils/summarizer.py b/openviking/utils/summarizer.py index e9a1cb20..dff099e6 100644 --- a/openviking/utils/summarizer.py +++ b/openviking/utils/summarizer.py @@ -74,6 +74,7 @@ async def summarize( telemetry_id=telemetry.telemetry_id if telemetry.enabled else "", target_uri=uri if uri != temp_uri else None, lifecycle_lock_handle_id=lifecycle_lock_handle_id, + is_code_repo=kwargs.get("is_code_repo", False), ) await semantic_queue.enqueue(msg) enqueued_count += 1