Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions openviking/storage/queuefs/semantic_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class VectorizeTask:
file_path: Optional[str] = None
summary_dict: Optional[Dict[str, str]] = None
parent_uri: Optional[str] = None
use_summary: bool = False
# For directory tasks
abstract: Optional[str] = None
overview: Optional[str] = None
Expand All @@ -76,6 +77,7 @@ def __init__(
semantic_msg_id: Optional[str] = None,
recursive: bool = True,
lifecycle_lock_handle_id: str = "",
is_code_repo: bool = False,
):
self._processor = processor
self._context_type = context_type
Expand All @@ -86,6 +88,7 @@ def __init__(
self._semantic_msg_id = semantic_msg_id
self._recursive = recursive
self._lifecycle_lock_handle_id = lifecycle_lock_handle_id
self._is_code_repo = is_code_repo
self._llm_sem = asyncio.Semaphore(max_concurrent_llm)
self._viking_fs = get_viking_fs()
self._nodes: Dict[str, DirNode] = {}
Expand Down Expand Up @@ -192,6 +195,7 @@ async def wrapped_on_complete() -> None:
summary_dict=task.summary_dict,
ctx=task.ctx,
semantic_msg_id=task.semantic_msg_id,
use_summary=task.use_summary,
)
)
else:
Expand Down Expand Up @@ -432,6 +436,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None:

try:
if need_vectorize:
use_summary = self._is_code_repo and bool(summary_dict.get("summary"))
task = VectorizeTask(
task_type="file",
uri=file_path,
Expand All @@ -441,6 +446,7 @@ async def _file_summary_task(self, parent_uri: str, file_path: str) -> None:
file_path=file_path,
summary_dict=summary_dict,
parent_uri=parent_uri,
use_summary=use_summary,
)
await self._add_vectorize_task(task)
except Exception as e:
Expand Down
4 changes: 4 additions & 0 deletions openviking/storage/queuefs/semantic_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class SemanticMsg:
telemetry_id: str = ""
target_uri: str = ""
lifecycle_lock_handle_id: str = ""
is_code_repo: bool = False
changes: Optional[Dict[str, List[str]]] = (
None # {"added": [...], "modified": [...], "deleted": [...]}
)
Expand All @@ -57,6 +58,7 @@ def __init__(
telemetry_id: str = "",
target_uri: str = "",
lifecycle_lock_handle_id: str = "",
is_code_repo: bool = False,
changes: Optional[Dict[str, List[str]]] = None,
):
self.id = str(uuid4())
Expand All @@ -71,6 +73,7 @@ def __init__(
self.telemetry_id = telemetry_id
self.target_uri = target_uri
self.lifecycle_lock_handle_id = lifecycle_lock_handle_id
self.is_code_repo = is_code_repo
self.changes = changes

def to_dict(self) -> Dict[str, Any]:
Expand Down Expand Up @@ -110,6 +113,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg":
telemetry_id=data.get("telemetry_id", ""),
target_uri=data.get("target_uri", ""),
lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""),
is_code_repo=data.get("is_code_repo", False),
changes=data.get("changes"),
)
if "id" in data and data["id"]:
Expand Down
6 changes: 6 additions & 0 deletions openviking/storage/queuefs/semantic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str,
semantic_msg_id=msg.id,
recursive=msg.recursive,
lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id,
is_code_repo=msg.is_code_repo,
)
self._dag_executor = executor
await executor.run(msg.uri)
Expand Down Expand Up @@ -651,6 +652,9 @@ async def _generate_text_summary(
verbose = code_mode == "ast_llm"
skeleton_text = extract_skeleton(file_name, content, verbose=verbose)
if skeleton_text:
max_skeleton_chars = get_openviking_config().semantic.max_skeleton_chars
if len(skeleton_text) > max_skeleton_chars:
skeleton_text = skeleton_text[:max_skeleton_chars]
if code_mode == "ast":
return {"name": file_name, "summary": skeleton_text}
else: # ast_llm
Expand Down Expand Up @@ -1069,6 +1073,7 @@ async def _vectorize_single_file(
summary_dict: Dict[str, str],
ctx: Optional[RequestContext] = None,
semantic_msg_id: Optional[str] = None,
use_summary: bool = False,
) -> None:
"""Vectorize a single file using its content or summary."""
from openviking.utils.embedding_utils import vectorize_file
Expand All @@ -1081,4 +1086,5 @@ async def _vectorize_single_file(
context_type=context_type,
ctx=active_ctx,
semantic_msg_id=semantic_msg_id,
use_summary=use_summary,
)
40 changes: 24 additions & 16 deletions openviking/utils/embedding_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,14 @@ async def vectorize_file(
context_type: str = "resource",
ctx: Optional[RequestContext] = None,
semantic_msg_id: Optional[str] = None,
use_summary: bool = False,
) -> None:
"""
Vectorize a single file.

Creates Context object for the file and enqueues it.
Reads content for TEXT files, otherwise uses summary.
If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario).
Otherwise reads raw file content for TEXT files, falls back to summary on failure.
"""
enqueued = False

Expand Down Expand Up @@ -260,21 +262,27 @@ async def vectorize_file(
)
return
elif content_type == ResourceContentType.TEXT:
# For text files, try to read content
try:
content = await viking_fs.read_file(file_path, ctx=ctx)
if isinstance(content, bytes):
content = content.decode("utf-8", errors="replace")
context.set_vectorize(Vectorize(text=content))
except Exception as e:
logger.warning(
f"Failed to read file content for {file_path}, falling back to summary: {e}"
)
if summary:
context.set_vectorize(Vectorize(text=summary))
else:
logger.warning(f"No summary available for {file_path}, skipping vectorization")
return
if use_summary and summary:
# Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding
context.set_vectorize(Vectorize(text=summary))
else:
# Default: read raw file content
try:
content = await viking_fs.read_file(file_path, ctx=ctx)
if isinstance(content, bytes):
content = content.decode("utf-8", errors="replace")
context.set_vectorize(Vectorize(text=content))
except Exception as e:
logger.warning(
f"Failed to read file content for {file_path}, falling back to summary: {e}"
)
if summary:
context.set_vectorize(Vectorize(text=summary))
else:
logger.warning(
f"No summary available for {file_path}, skipping vectorization"
)
return
elif summary:
# For non-text files, use summary
context.set_vectorize(Vectorize(text=summary))
Expand Down
2 changes: 2 additions & 0 deletions openviking/utils/resource_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ async def process_resource(
should_summarize = summarize or build_index
if should_summarize:
skip_vec = not build_index
is_code_repo = parse_result.source_format == "repository"
try:
with telemetry.measure("resource.summarize"):
await self._get_summarizer().summarize(
Expand All @@ -273,6 +274,7 @@ async def process_resource(
skip_vectorization=skip_vec,
lifecycle_lock_handle_id=lifecycle_lock_handle_id,
temp_uris=[temp_uri_for_summarize],
is_code_repo=is_code_repo,
**kwargs,
)
except Exception as e:
Expand Down
1 change: 1 addition & 0 deletions openviking/utils/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ async def summarize(
telemetry_id=telemetry.telemetry_id if telemetry.enabled else "",
target_uri=uri if uri != temp_uri else None,
lifecycle_lock_handle_id=lifecycle_lock_handle_id,
is_code_repo=kwargs.get("is_code_repo", False),
)
await semantic_queue.enqueue(msg)
enqueued_count += 1
Expand Down
3 changes: 3 additions & 0 deletions openviking_cli/utils/config/parser_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,9 @@ class SemanticConfig:
max_file_content_chars: int = 30000
"""Maximum characters of file content sent to LLM for summary generation."""

max_skeleton_chars: int = 12000
"""Maximum characters of AST skeleton used for embedding (~3000 tokens)."""

max_overview_prompt_chars: int = 60000
"""Maximum characters allowed in the overview generation prompt.
If exceeded, file summaries are batched and merged."""
Expand Down
Loading