From 5d85536707bbf7d451875c055d777ee8b0b7817b Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 11:49:00 +0800 Subject: [PATCH 01/18] fix: make rust CLI (ov) commands match python CLI (openviking) exactly - add top-level wait/status/health commands --- crates/ov_cli/src/main.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 0d2ac0ce..e164974e 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -125,6 +125,16 @@ enum Commands { #[arg(long)] no_vectorize: bool, }, + /// Wait for queued async processing to complete + Wait { + /// Wait timeout in seconds + #[arg(long)] + timeout: Option, + }, + /// Show OpenViking component status + Status, + /// Quick health check + Health, /// System utility commands System { #[command(subcommand)] @@ -363,6 +373,15 @@ async fn main() { Commands::Import { file_path, target_uri, force, no_vectorize } => { handle_import(file_path, target_uri, force, no_vectorize, ctx).await } + Commands::Wait { timeout } => { + let client = ctx.get_client(); + commands::system::wait(&client, timeout, ctx.output_format, ctx.compact).await + }, + Commands::Status => { + let client = ctx.get_client(); + commands::observer::system(&client, ctx.output_format, ctx.compact).await + }, + Commands::Health => handle_health(ctx).await, Commands::System { action } => handle_system(action, ctx).await, Commands::Observer { action } => handle_observer(action, ctx).await, Commands::Session { action } => handle_session(action, ctx).await, @@ -651,3 +670,14 @@ async fn handle_glob(pattern: String, uri: String, ctx: CliContext) -> Result<() let client = ctx.get_client(); commands::search::glob(&client, &pattern, &uri, ctx.output_format, ctx.compact).await } + +async fn handle_health(ctx: CliContext) -> Result<()> { + let client = ctx.get_client(); + let system_status: serde_json::Value = client.get("/api/v1/observer/system", &[]).await?; + 
let is_healthy = system_status.get("is_healthy").and_then(|v| v.as_bool()).unwrap_or(false); + output::output_success(&serde_json::json!({ "healthy": is_healthy }), ctx.output_format, ctx.compact); + if !is_healthy { + std::process::exit(1); + } + Ok(()) +} From 0e08354febd487cd2e3b874102a1ae97ae31c4d8 Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 12:32:43 +0800 Subject: [PATCH 02/18] fix: ov cli plays same as py cli (ls, tree) --- README.md | 2 +- README_CN.md | 2 +- crates/ov_cli/src/client.rs | 16 ++++++++-- crates/ov_cli/src/commands/filesystem.rs | 12 ++++++-- crates/ov_cli/src/main.rs | 36 ++++++++++++++++++----- openviking/async_client.py | 7 ++++- openviking/client/local.py | 7 ++++- openviking/server/routers/filesystem.py | 9 +++++- openviking/service/fs_service.py | 15 ++++++++-- openviking/storage/viking_fs.py | 23 +++++++++++---- openviking_cli/cli/commands/filesystem.py | 13 +++++++- openviking_cli/client/base.py | 2 ++ openviking_cli/client/http.py | 4 +++ openviking_cli/client/sync_http.py | 11 +++++-- 14 files changed, 130 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index b8107d29..8f674818 100644 --- a/README.md +++ b/README.md @@ -581,7 +581,7 @@ Let's work together to define and build the future of AI Agent context managemen ### Star Trend -[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=Timeline)](https://www.star-history.com/#volcengine/OpenViking&Timeline) +[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=timeline&legend=top-left)](https://www.star-history.com/#volcengine/OpenViking&type=timeline&legend=top-left) --- diff --git a/README_CN.md b/README_CN.md index 0f2e76ca..c09354ed 100644 --- a/README_CN.md +++ b/README_CN.md @@ -457,7 +457,7 @@ OpenViking 目前还处于早期阶段,有许多需要完善和探索的地方 ### Star 趋势 -[![Star History 
Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=Timeline)](https://www.star-history.com/#volcengine/OpenViking&Timeline) +[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=timeline&legend=top-left)](https://www.star-history.com/#volcengine/OpenViking&type=timeline&legend=top-left) --- diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index 53dc3b7b..2c2bcc8b 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -191,17 +191,27 @@ impl HttpClient { // ============ Filesystem Methods ============ - pub async fn ls(&self, uri: &str, simple: bool, recursive: bool) -> Result { + pub async fn ls(&self, uri: &str, simple: bool, recursive: bool, output: &str, abs_limit: i32, show_all_hidden: bool, node_limit: i32) -> Result { let params = vec![ ("uri".to_string(), uri.to_string()), ("simple".to_string(), simple.to_string()), ("recursive".to_string(), recursive.to_string()), + ("output".to_string(), output.to_string()), + ("abs_limit".to_string(), abs_limit.to_string()), + ("show_all_hidden".to_string(), show_all_hidden.to_string()), + ("node_limit".to_string(), node_limit.to_string()), ]; self.get("/api/v1/fs/ls", &params).await } - pub async fn tree(&self, uri: &str) -> Result { - let params = vec![("uri".to_string(), uri.to_string())]; + pub async fn tree(&self, uri: &str, output: &str, abs_limit: i32, show_all_hidden: bool, node_limit: i32) -> Result { + let params = vec![ + ("uri".to_string(), uri.to_string()), + ("output".to_string(), output.to_string()), + ("abs_limit".to_string(), abs_limit.to_string()), + ("show_all_hidden".to_string(), show_all_hidden.to_string()), + ("node_limit".to_string(), node_limit.to_string()), + ]; self.get("/api/v1/fs/tree", &params).await } diff --git a/crates/ov_cli/src/commands/filesystem.rs b/crates/ov_cli/src/commands/filesystem.rs index 0034c1a0..b281f1c7 100644 --- a/crates/ov_cli/src/commands/filesystem.rs +++
b/crates/ov_cli/src/commands/filesystem.rs @@ -7,10 +7,14 @@ pub async fn ls( uri: &str, simple: bool, recursive: bool, + output: &str, + abs_limit: i32, + show_all_hidden: bool, + node_limit: i32, output_format: OutputFormat, compact: bool, ) -> Result<()> { - let result = client.ls(uri, simple, recursive).await?; + let result = client.ls(uri, simple, recursive, output, abs_limit, show_all_hidden, node_limit).await?; output_success(&result, output_format, compact); Ok(()) } @@ -18,10 +22,14 @@ pub async fn ls( pub async fn tree( client: &HttpClient, uri: &str, + output: &str, + abs_limit: i32, + show_all_hidden: bool, + node_limit: i32, output_format: OutputFormat, compact: bool, ) -> Result<()> { - let result = client.tree(uri).await?; + let result = client.tree(uri, output, abs_limit, show_all_hidden, node_limit).await?; output_success(&result, output_format, compact); Ok(()) } diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index e164974e..b98aaf4c 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -162,11 +162,29 @@ enum Commands { /// List all subdirectories recursively #[arg(short, long)] recursive: bool, + /// Abstract content limit (only for agent output) + #[arg(long = "abs-limit", short = 'l', default_value = "256")] + abs_limit: i32, + /// Show all hidden files + #[arg(short, long)] + all: bool, + /// Maximum number of nodes to list + #[arg(long = "node-limit", short = 'n', default_value = "1000")] + node_limit: i32, }, /// Get directory tree Tree { /// Viking URI to get tree for uri: String, + /// Abstract content limit (only for agent output) + #[arg(long = "abs-limit", short = 'l', default_value = "128")] + abs_limit: i32, + /// Show all hidden files + #[arg(short, long)] + all: bool, + /// Maximum number of nodes to list + #[arg(long = "node-limit", short = 'n', default_value = "1000")] + node_limit: i32, }, /// Create directory Mkdir { @@ -385,11 +403,11 @@ async fn main() { Commands::System { action } => 
handle_system(action, ctx).await, Commands::Observer { action } => handle_observer(action, ctx).await, Commands::Session { action } => handle_session(action, ctx).await, - Commands::Ls { uri, simple, recursive } => { - handle_ls(uri, simple, recursive, ctx).await + Commands::Ls { uri, simple, recursive, abs_limit, all, node_limit } => { + handle_ls(uri, simple, recursive, abs_limit, all, node_limit, ctx).await } - Commands::Tree { uri } => { - handle_tree(uri, ctx).await + Commands::Tree { uri, abs_limit, all, node_limit } => { + handle_tree(uri, abs_limit, all, node_limit, ctx).await } Commands::Mkdir { uri } => { handle_mkdir(uri, ctx).await @@ -631,14 +649,16 @@ async fn handle_search( commands::search::search(&client, &query, &uri, session_id, limit, threshold, ctx.output_format, ctx.compact).await } -async fn handle_ls(uri: String, simple: bool, recursive: bool, ctx: CliContext) -> Result<()> { +async fn handle_ls(uri: String, simple: bool, recursive: bool, abs_limit: i32, show_all_hidden: bool, node_limit: i32, ctx: CliContext) -> Result<()> { let client = ctx.get_client(); - commands::filesystem::ls(&client, &uri, simple, recursive, ctx.output_format, ctx.compact).await + let api_output = if ctx.compact { "agent" } else { "original" }; + commands::filesystem::ls(&client, &uri, simple, recursive, api_output, abs_limit, show_all_hidden, node_limit, ctx.output_format, ctx.compact).await } -async fn handle_tree(uri: String, ctx: CliContext) -> Result<()> { +async fn handle_tree(uri: String, abs_limit: i32, show_all_hidden: bool, node_limit: i32, ctx: CliContext) -> Result<()> { let client = ctx.get_client(); - commands::filesystem::tree(&client, &uri, ctx.output_format, ctx.compact).await + let api_output = if ctx.compact { "agent" } else { "original" }; + commands::filesystem::tree(&client, &uri, api_output, abs_limit, show_all_hidden, node_limit, ctx.output_format, ctx.compact).await } async fn handle_mkdir(uri: String, ctx: CliContext) -> Result<()> { diff 
--git a/openviking/async_client.py b/openviking/async_client.py index b4f48ed5..84e3c7e1 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -306,8 +306,13 @@ async def tree(self, uri: str, **kwargs) -> Dict: output = kwargs.get("output", "original") abs_limit = kwargs.get("abs_limit", 128) show_all_hidden = kwargs.get("show_all_hidden", True) + node_limit = kwargs.get("node_limit", 1000) return await self._client.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) async def mkdir(self, uri: str) -> None: diff --git a/openviking/client/local.py b/openviking/client/local.py index 46acca99..389439ac 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -113,10 +113,15 @@ async def tree( output: str = "original", abs_limit: int = 128, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Dict[str, Any]]: """Get directory tree.""" return await self._service.fs.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) async def stat(self, uri: str) -> Dict[str, Any]: diff --git a/openviking/server/routers/filesystem.py b/openviking/server/routers/filesystem.py index 5e71fe65..a24e64c6 100644 --- a/openviking/server/routers/filesystem.py +++ b/openviking/server/routers/filesystem.py @@ -22,6 +22,7 @@ async def ls( output: str = Query("agent", description="Output format: original or agent"), abs_limit: int = Query(256, description="Abstract limit (only for agent output)"), show_all_hidden: bool = Query(False, description="List all hidden files, like -a"), + node_limit: int = Query(1000, description="Maximum number of nodes to list"), _: bool = Depends(verify_api_key), ): """List directory contents.""" @@ -33,6 +34,7 @@ async def ls( output=output, 
abs_limit=abs_limit, show_all_hidden=show_all_hidden, + node_limit=node_limit, ) return Response(status="ok", result=result) @@ -43,12 +45,17 @@ async def tree( output: str = Query("agent", description="Output format: original or agent"), abs_limit: int = Query(256, description="Abstract limit (only for agent output)"), show_all_hidden: bool = Query(False, description="List all hidden files, like -a"), + node_limit: int = Query(1000, description="Maximum number of nodes to list"), _: bool = Depends(verify_api_key), ): """Get directory tree.""" service = get_service() result = await service.fs.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) return Response(status="ok", result=result) diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 72e5526d..598e0fc0 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -39,6 +39,7 @@ async def ls( output: str = "original", abs_limit: int = 256, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Any]: """List directory contents. 
@@ -49,12 +50,17 @@ async def ls( output: str = "original" or "agent" abs_limit: int = 256 if output == "agent" else ignore show_all_hidden: bool = False (list all hidden files, like -a) + node_limit: int = 1000 (maximum number of nodes to list) """ viking_fs = self._ensure_initialized() if recursive: entries = await viking_fs.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) else: entries = await viking_fs.ls( @@ -86,11 +92,16 @@ async def tree( output: str = "original", abs_limit: int = 128, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Dict[str, Any]]: """Get directory tree.""" viking_fs = self._ensure_initialized() return await viking_fs.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) async def stat(self, uri: str) -> Dict[str, Any]: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 3f3f173e..a8e52c72 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -201,9 +201,9 @@ async def stat(self, uri: str) -> Dict[str, Any]: path = self._uri_to_path(uri) return self.agfs.stat(path) - async def glob(self, pattern: str, uri: str = "viking://") -> Dict: + async def glob(self, pattern: str, uri: str = "viking://", node_limit: int = 1000) -> Dict: """File pattern matching, supports **/*.md recursive.""" - entries = await self.tree(uri) + entries = await self.tree(uri, node_limit=node_limit) base_uri = uri.rstrip("/") matches = [] for entry in entries: @@ -248,6 +248,7 @@ async def tree( output: str = "original", abs_limit: int = 256, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Dict[str, Any]]: """ Recursively list all contents (includes rel_path). 
@@ -265,19 +266,25 @@ async def tree( [{'name': '.abstract.md', 'size': 100, 'modTime': '2026-02-11 16:52:16', 'isDir': False, 'rel_path': '.abstract.md', 'uri': 'viking://resources...', 'abstract': "..."}] """ if output == "original": - return await self._tree_original(uri, show_all_hidden) + return await self._tree_original(uri, show_all_hidden, node_limit) elif output == "agent": - return await self._tree_agent(uri, abs_limit, show_all_hidden) + return await self._tree_agent(uri, abs_limit, show_all_hidden, node_limit) else: raise ValueError(f"Invalid output format: {output}") - async def _tree_original(self, uri: str, show_all_hidden: bool = False) -> List[Dict[str, Any]]: + async def _tree_original( + self, uri: str, show_all_hidden: bool = False, node_limit: int = 1000 + ) -> List[Dict[str, Any]]: """Recursively list all contents (original format).""" path = self._uri_to_path(uri) all_entries = [] async def _walk(current_path: str, current_rel: str): + if len(all_entries) >= node_limit: + return for entry in self.agfs.ls(current_path): + if len(all_entries) >= node_limit: + break name = entry.get("name", "") if name in [".", ".."]: continue @@ -297,7 +304,7 @@ async def _walk(current_path: str, current_rel: str): return all_entries async def _tree_agent( - self, uri: str, abs_limit: int, show_all_hidden: bool = False + self, uri: str, abs_limit: int, show_all_hidden: bool = False, node_limit: int = 1000 ) -> List[Dict[str, Any]]: """Recursively list all contents (agent format with abstracts).""" path = self._uri_to_path(uri) @@ -305,7 +312,11 @@ async def _tree_agent( now = datetime.now() async def _walk(current_path: str, current_rel: str): + if len(all_entries) >= node_limit: + return for entry in self.agfs.ls(current_path): + if len(all_entries) >= node_limit: + break name = entry.get("name", "") if name in [".", ".."]: continue diff --git a/openviking_cli/cli/commands/filesystem.py b/openviking_cli/cli/commands/filesystem.py index 9b776562..27c7c44a 
100644 --- a/openviking_cli/cli/commands/filesystem.py +++ b/openviking_cli/cli/commands/filesystem.py @@ -26,6 +26,9 @@ def ls_command( ), abs_limit: int = typer.Option(256, "--abs-limit", "-l", help="Abstract content limit"), show_all_hidden: bool = typer.Option(False, "--all", "-a", help="Show all hidden files"), + node_limit: int = typer.Option( + 1000, "--node-limit", "-n", help="Maximum number of nodes to list" + ), ) -> None: """List directory contents.""" run( @@ -37,6 +40,7 @@ def ls_command( output=output_format, abs_limit=abs_limit, show_all_hidden=show_all_hidden, + node_limit=node_limit, ), ) @@ -49,6 +53,9 @@ def tree_command( ), abs_limit: int = typer.Option(128, "--abs-limit", "-l", help="Abstract content limit"), show_all_hidden: bool = typer.Option(False, "--all", "-a", help="Show all hidden files"), + node_limit: int = typer.Option( + 1000, "--node-limit", "-n", help="Maximum number of nodes to list" + ), ) -> None: """ Get directory tree info. @@ -56,7 +63,11 @@ def tree_command( run( ctx, lambda client: client.tree( - uri, output=output_format, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output_format, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ), ) diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index b72d4f3a..7882b585 100644 --- a/openviking_cli/client/base.py +++ b/openviking_cli/client/base.py @@ -68,6 +68,7 @@ async def ls( output: str = "original", abs_limit: int = 256, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Any]: """List directory contents.""" ... @@ -79,6 +80,7 @@ async def tree( output: str = "original", abs_limit: int = 128, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Dict[str, Any]]: """Get directory tree.""" ... 
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index a8ae3726..39526602 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -278,6 +278,7 @@ async def ls( output: str = "original", abs_limit: int = 256, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Any]: """List directory contents.""" response = await self._http.get( @@ -289,6 +290,7 @@ async def ls( "output": output, "abs_limit": abs_limit, "show_all_hidden": show_all_hidden, + "node_limit": node_limit, }, ) return self._handle_response(response) @@ -299,6 +301,7 @@ async def tree( output: str = "original", abs_limit: int = 128, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Dict[str, Any]]: """Get directory tree.""" response = await self._http.get( @@ -308,6 +311,7 @@ async def tree( "output": output, "abs_limit": abs_limit, "show_all_hidden": show_all_hidden, + "node_limit": node_limit, }, ) return self._handle_response(response) diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py index fdfa450f..97a9e0f2 100644 --- a/openviking_cli/client/sync_http.py +++ b/openviking_cli/client/sync_http.py @@ -160,6 +160,7 @@ def ls( output: str = "original", abs_limit: int = 256, show_all_hidden: bool = False, + node_limit: int = 1000, ) -> List[Any]: """List directory contents.""" return run_async( @@ -170,6 +171,7 @@ def ls( output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden, + node_limit=node_limit, ) ) @@ -179,11 +181,16 @@ def tree( output: str = "original", abs_limit: int = 128, show_all_hidden: bool = False, - ) -> Dict: + node_limit: int = 1000, + ) -> List[Dict[str, Any]]: """Get directory tree.""" return run_async( self._async_client.tree( - uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden + uri, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=node_limit, ) ) From 7376d61aabd1eefbdbf794de61ca81b467dc8c91 
Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 13:24:35 +0800 Subject: [PATCH 03/18] Refactor media parsers to subdirectory structure with validation --- openviking/parse/parsers/README.md | 17 + openviking/parse/parsers/media/__init__.py | 8 + openviking/parse/parsers/media/audio.py | 313 ++++++++++++++++++ .../parsers/{media.py => media/image.py} | 275 +-------------- openviking/parse/parsers/media/video.py | 292 ++++++++++++++++ openviking/parse/registry.py | 10 +- openviking/parse/tree_builder.py | 38 ++- 7 files changed, 689 insertions(+), 264 deletions(-) create mode 100644 openviking/parse/parsers/media/__init__.py create mode 100644 openviking/parse/parsers/media/audio.py rename openviking/parse/parsers/{media.py => media/image.py} (53%) create mode 100644 openviking/parse/parsers/media/video.py diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index b8bcefe0..7f452eb0 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -148,6 +148,23 @@ L1: """ 多媒体解析器,使用 VLM(视觉语言模型)分析图像、视频和音频内容,生成文本描述。 +对于添加多媒体文件的存储组织方式,我们采用以下策略: +* 在 viking://resource 下创建 images, audio, video 三个媒体子目录,分别是: + * viking://resource/images 用于存储提交时未明确指定目标路径的图片文件 + * viking://resource/audio 用于存储提交时未明确指定目标路径的音频文件 + * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件 +* 对于每个媒体子目录下,每次上传的文件放在当前日期(而非文件内部元信息时间)的子目录下,例如: + * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件 +* 对于每个多媒体文件,默认创建一个文件夹,文件夹名称与文件名相同但默认不包含后缀,例如: + * 上传文件 `20240820_123456.jpg` 后,默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456` 用于存储该文件的相关内容 + * 该文件夹内默认包含一个 `.abstract.md` 文件,用于存储该文件的摘要信息 + * 例如:图片文件的摘要可能是图片的文件名、内容描述、画面风格等,正常不应超过 200 token + * 该文件夹内默认包含一个 `.overview.md` 文件,用于存储该文件的概览内容,例如: + * 图片文件的概览内容除了包含 `.abstract.md` 中的内容,还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等 + * 音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果,以及主要的章节对应的时间线等 + *
视频文件的概览内容可能包含视频的文件名、时长、使用场景等,对于较大的视频,未来会对视频进行切分,可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等,因此视频的处理逻辑预期较为复杂,可等待图片、音频实现后,参考文件夹或 zip 的递归处理形态进行处理。 + + ## 核心组件 ### BaseParser (`base_parser.py`) diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py new file mode 100644 index 00000000..7fed46b5 --- /dev/null +++ b/openviking/parse/parsers/media/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +from .audio import AudioParser +from .image import ImageParser +from .video import VideoParser + +__all__ = ["ImageParser", "AudioParser", "VideoParser"] diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py new file mode 100644 index 00000000..372a0cab --- /dev/null +++ b/openviking/parse/parsers/media/audio.py @@ -0,0 +1,313 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Audio parser - Future implementation. + +Planned Features: +1. Speech-to-text transcription using ASR models +2. Audio metadata extraction (duration, sample rate, channels) +3. Speaker diarization (identify different speakers) +4. Timestamp alignment for transcribed text +5. Generate structured ResourceNode with transcript + +Example workflow: + 1. Load audio file + 2. Extract metadata (duration, format, sample rate) + 3. Transcribe speech to text using Whisper or similar + 4. (Optional) Perform speaker diarization + 5. Create ResourceNode with: + - type: NodeType.ROOT + - children: sections for each speaker/timestamp + - meta: audio metadata and timestamps + 6. 
Return ParseResult + +Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import AudioConfig + + +class AudioParser(BaseParser): + """ + Audio parser for audio files. + """ + + def __init__(self, config: Optional[AudioConfig] = None, **kwargs): + """ + Initialize AudioParser. + + Args: + config: Audio parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or AudioConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported audio file extensions.""" + return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse audio file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original audio to temp_uri/content.{ext} + - Generate description.md using ASR + - (Optional) Generate transcript with timestamps + + Phase 2: Generate semantic info + - Generate abstract and overview based on description.md + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Audio file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If audio processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Audio file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + audio_bytes = file_path.read_bytes() + ext = file_path.suffix + + root_dir_name = file_path.stem + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original audio + await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", audio_bytes) + + # 1.2 Validate audio file using magic bytes + # Define magic bytes for supported audio formats + audio_magic_bytes = { + ".mp3": [b"ID3", b"\xff\xfb", b"\xff\xf3", b"\xff\xf2"], + ".wav": [b"RIFF"], + ".ogg": [b"OggS"], + ".flac": [b"fLaC"], + ".aac": [b"\xff\xf1", b"\xff\xf9"], + ".m4a": [b"\x00\x00\x00", b"ftypM4A", b"ftypisom"], + ".opus": [b"OggS"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = audio_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(audio_bytes) >= len(magic) and audio_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid audio file: {file_path}. 
File signature does not match expected format {ext_lower}" + ) + + # Extract audio metadata (placeholder) + duration = 0 + sample_rate = 0 + channels = 0 + format_str = ext[1:].upper() + + # 1.3 Generate ASR description + description = "" + if self.config.enable_transcription: + description = await self._asr_transcribe(audio_bytes, self.config.asr_model) + else: + # Fallback: basic description + description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)" + + await viking_fs.write_file(f"{root_dir_uri}/description.md", description) + + # 1.4 Transcript with timestamps (optional) + transcript_text = None + if self.config.enable_transcription and self.config.enable_timestamps: + transcript_text = await self._asr_transcribe_with_timestamps( + audio_bytes, self.config.asr_model + ) + if transcript_text: + await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text) + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "duration": duration, + "sample_rate": sample_rate, + "channels": channels, + "format": format_str.lower(), + "content_type": "audio", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info( + root_node, root_dir_uri, viking_fs, transcript_text is not None + ) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="audio", + parser_name="AudioParser", + meta={"content_type": "audio", "format": format_str.lower()}, + ) + + async def _asr_transcribe(self, audio_bytes: bytes, model: Optional[str]) -> str: + """ + Generate audio transcription using ASR. 
+ + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Audio transcription in markdown format + + TODO: Integrate with actual ASR API (Whisper, etc.) + """ + # Fallback implementation - returns basic placeholder + return "Audio transcription (ASR integration pending)\n\nThis is an audio. ASR transcription feature has not yet integrated external API." + + async def _asr_transcribe_with_timestamps( + self, audio_bytes: bytes, model: Optional[str] + ) -> Optional[str]: + """ + Extract transcription with timestamps from audio using ASR. + + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Transcript with timestamps in markdown format, or None if not available + + TODO: Integrate with ASR API + """ + # Not implemented - return None + return None + + async def _generate_semantic_info( + self, node: ResourceNode, temp_uri: str, viking_fs, has_transcript: bool + ): + """ + Phase 2: Generate abstract and overview. + + Args: + node: ResourceNode to update + temp_uri: Temporary URI + viking_fs: VikingFS instance + has_transcript: Whether transcript file exists + """ + # Read description.md + description = await viking_fs.read_file(f"{temp_uri}/description.md") + + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n", + "- description.md: Detailed audio transcription generated by ASR\n", + ] + + if has_transcript: + overview_parts.append("- transcript.md: Transcript with timestamps from the audio\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### Play Audio\n") + 
overview_parts.append("```python\n") + overview_parts.append("audio_bytes = await audio_resource.play()\n") + overview_parts.append("# Returns: Audio file binary data\n") + overview_parts.append("# Purpose: Play or save the audio\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get ASR-generated Transcription\n") + overview_parts.append("```python\n") + overview_parts.append("transcription = await audio_resource.transcription()\n") + overview_parts.append("# Returns: FileContent object for further processing\n") + overview_parts.append("# Purpose: Understand audio content\n") + overview_parts.append("```\n\n") + + if has_transcript: + overview_parts.append("### Get Timestamps Transcript\n") + overview_parts.append("```python\n") + overview_parts.append("timestamps = await audio_resource.timestamps()\n") + overview_parts.append("# Returns: FileContent object or None\n") + overview_parts.append("# Purpose: Extract timestamped transcript from the audio\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Audio Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"duration = audio_resource.get_duration() # {node.meta['duration']}s\n" + ) + overview_parts.append( + f"sample_rate = audio_resource.get_sample_rate() # {node.meta['sample_rate']}Hz\n" + ) + overview_parts.append( + f"channels = audio_resource.get_channels() # {node.meta['channels']}\n" + ) + overview_parts.append(f'format = audio_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse audio from content string - Not yet implemented. 
+ + Args: + content: Audio content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Audio parsing from content not yet implemented") diff --git a/openviking/parse/parsers/media.py b/openviking/parse/parsers/media/image.py similarity index 53% rename from openviking/parse/parsers/media.py rename to openviking/parse/parsers/media/image.py index cb25a803..0965b730 100644 --- a/openviking/parse/parsers/media.py +++ b/openviking/parse/parsers/media/image.py @@ -17,7 +17,7 @@ from openviking.parse.base import NodeType, ParseResult, ResourceNode from openviking.parse.parsers.base_parser import BaseParser -from openviking_cli.utils.config.parser_config import AudioConfig, ImageConfig, VideoConfig +from openviking_cli.utils.config.parser_config import ImageConfig # ============================================================================= # Configuration Classes @@ -106,17 +106,23 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) image_bytes = file_path.read_bytes() ext = file_path.suffix + root_dir_name = file_path.stem + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + # 1.1 Save original image - await viking_fs.write_file_bytes(f"{temp_uri}/content{ext}", image_bytes) + await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", image_bytes) - # 1.2 Extract image metadata + # 1.2 Validate and extract image metadata try: + img = Image.open(file_path) + img.verify() # Verify that it's a valid image + img.close() # Close and reopen to reset after verify() img = Image.open(file_path) width, height = img.size format_str = img.format or ext[1:].upper() - except Exception: - width, height = 0, 0 - format_str = ext[1:].upper() + except Exception as e: + raise ValueError(f"Invalid image 
file: {file_path}. Error: {e}") from e # 1.3 Generate VLM description description = "" @@ -126,14 +132,14 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) # Fallback: basic description description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" - await viking_fs.write_file(f"{temp_uri}/description.md", description) + await viking_fs.write_file(f"{root_dir_uri}/description.md", description) # 1.4 OCR (optional) ocr_text = None if self.config.enable_ocr: ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) if ocr_text: - await viking_fs.write_file(f"{temp_uri}/ocr.md", ocr_text) + await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text) # Create ResourceNode root_node = ResourceNode( @@ -154,7 +160,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) ) # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, temp_uri, viking_fs, ocr_text is not None) + await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, ocr_text is not None) # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( @@ -283,254 +289,3 @@ async def parse_content( NotImplementedError: This feature is not yet implemented """ raise NotImplementedError("Image parsing not yet implemented") - - -class AudioParser(BaseParser): - """ - Audio parser - Future implementation. - - Planned Features: - 1. Speech-to-text transcription using ASR models - 2. Audio metadata extraction (duration, sample rate, channels) - 3. Speaker diarization (identify different speakers) - 4. Timestamp alignment for transcribed text - 5. Generate structured ResourceNode with transcript - - Example workflow: - 1. Load audio file - 2. Extract metadata (duration, format, sample rate) - 3. Transcribe speech to text using Whisper or similar - 4. (Optional) Perform speaker diarization - 5. 
Create ResourceNode with: - - type: NodeType.ROOT - - children: sections for each speaker/timestamp - - meta: audio metadata and timestamps - 6. Return ParseResult - - Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A - """ - - def __init__(self, config: Optional[AudioConfig] = None, **kwargs): - """ - Initialize AudioParser. - - Args: - config: Audio parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or AudioConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported audio file extensions.""" - return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse audio file - Not yet implemented. - - Planned implementation: - 1. Load audio file - 2. Extract metadata using librosa or similar - 3. If enable_transcription: - - Transcribe using Whisper or similar ASR model - - Generate timestamps for each segment - - (Optional) Perform speaker diarization - 4. Create ResourceNode tree: - - Root node with audio metadata - - Child nodes for each transcribed segment - 5. Return ParseResult - - Args: - source: Audio file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Audio parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Speech-to-text transcription (Whisper)" - "\n- Speaker diarization" - "\n- Timestamp alignment" - "\n- Audio metadata extraction" - "\n\nWorkaround: Extract audio manually and add transcripts as " - "text or markdown files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse audio from content string - Not yet implemented. 
- - Args: - content: Audio content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Audio parsing not yet implemented") - - -class VideoParser(BaseParser): - """ - Video parser - Future implementation. - - Planned Features: - 1. Key frame extraction at regular intervals - 2. Audio track transcription using ASR - 3. VLM-based scene description for key frames - 4. Video metadata extraction (duration, resolution, codec) - 5. Generate structured ResourceNode combining visual and audio - - Example workflow: - 1. Load video file - 2. Extract metadata (duration, resolution, fps) - 3. Extract audio track → transcribe using AudioParser - 4. Extract key frames at specified intervals - 5. For each frame: generate VLM description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: timeline nodes (each with frame + transcript) - 7. Return ParseResult - - Supported formats: MP4, AVI, MOV, MKV, WEBM - """ - - def __init__(self, config: Optional[VideoConfig] = None, **kwargs): - """ - Initialize VideoParser. - - Args: - config: Video parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or VideoConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported video file extensions.""" - return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse video file - Not yet implemented. - - Planned implementation: - 1. Load video file using cv2 or similar - 2. Extract metadata (duration, resolution, fps, codec) - 3. Extract audio track: - - Save as temporary audio file - - Parse using AudioParser - 4. 
Extract key frames: - - At specified intervals (e.g., every 10 seconds) - - Save frames as images - 5. For each frame (if enable_vlm_description): - - Use VLM to generate scene description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: Timeline segments - - Each segment contains: - - Timestamp - - Frame description (VLM) - - Transcript (ASR) - 7. Return ParseResult - - Args: - source: Video file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Video parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Key frame extraction" - "\n- Audio track transcription" - "\n- VLM scene description" - "\n- Timeline-based structured output" - "\n\nWorkaround: Extract frames and audio manually, then process " - "as images and audio files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse video from content string - Not yet implemented. - - Args: - content: Video content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Video parsing not yet implemented") - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def is_media_parser_available(parser_type: str) -> bool: - """ - Check if a media parser type is currently available. 
- - Args: - parser_type: Type of parser ("image", "audio", "video") - - Returns: - False (all media parsers are future implementations) - - Examples: - >>> is_media_parser_available("image") - False - >>> is_media_parser_available("video") - False - """ - return False - - -def get_media_parser_status() -> dict: - """ - Get status of all media parsers. - - Returns: - Dictionary with parser names and their implementation status - - Examples: - >>> status = get_media_parser_status() - >>> print(status) - { - "image": "planned", - "audio": "planned", - "video": "planned" - } - """ - return { - "image": "planned", - "audio": "planned", - "video": "planned", - } diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py new file mode 100644 index 00000000..807816e1 --- /dev/null +++ b/openviking/parse/parsers/media/video.py @@ -0,0 +1,292 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Video parser - Future implementation. + +Planned Features: +1. Key frame extraction at regular intervals +2. Audio track transcription using ASR +3. VLM-based scene description for key frames +4. Video metadata extraction (duration, resolution, codec) +5. Generate structured ResourceNode combining visual and audio + +Example workflow: + 1. Load video file + 2. Extract metadata (duration, resolution, fps) + 3. Extract audio track → transcribe using AudioParser + 4. Extract key frames at specified intervals + 5. For each frame: generate VLM description + 6. Create ResourceNode tree: + - Root: video metadata + - Children: timeline nodes (each with frame + transcript) + 7. 
Return ParseResult + +Supported formats: MP4, AVI, MOV, MKV, WEBM +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import VideoConfig + + +class VideoParser(BaseParser): + """ + Video parser for video files. + """ + + def __init__(self, config: Optional[VideoConfig] = None, **kwargs): + """ + Initialize VideoParser. + + Args: + config: Video parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or VideoConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported video file extensions.""" + return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse video file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original video to temp_uri/content.{ext} + - Extract key frames + - Generate description.md for each frame using VLM + - Extract audio track and transcribe using ASR + + Phase 2: Generate semantic info + - Generate abstract and overview based on descriptions + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Video file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with video content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If video processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Video file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + video_bytes = file_path.read_bytes() + ext = file_path.suffix + + root_dir_name = file_path.stem + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original video + await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", video_bytes) + + # 1.2 Validate video file using magic bytes + # Define magic bytes for supported video formats + video_magic_bytes = { + ".mp4": [b"\x00\x00\x00", b"ftyp"], + ".avi": [b"RIFF"], + ".mov": [b"\x00\x00\x00", b"ftyp"], + ".mkv": [b"\x1a\x45\xdf\xa3"], + ".webm": [b"\x1a\x45\xdf\xa3"], + ".flv": [b"FLV"], + ".wmv": [b"\x30\x26\xb2\x75\x8e\x66\xcf\x11"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = video_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(video_bytes) >= len(magic) and video_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid 
video file: {file_path}. File signature does not match expected format {ext_lower}" + ) + + # Extract video metadata (placeholder) + duration = 0 + width = 0 + height = 0 + fps = 0 + format_str = ext[1:].upper() + + # 1.3 Generate combined description + description = "" + if self.config.enable_key_frames or self.config.enable_audio_transcription: + description = await self._generate_video_description(file_path, self.config) + else: + # Fallback: basic description + description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)" + + await viking_fs.write_file(f"{root_dir_uri}/description.md", description) + + # 1.4 Key frames (optional) + key_frames_dir = f"{root_dir_uri}/keyframes" + has_key_frames = False + if self.config.enable_key_frames: + await viking_fs.mkdir(key_frames_dir) + has_key_frames = True + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "duration": duration, + "width": width, + "height": height, + "fps": fps, + "format": format_str.lower(), + "content_type": "video", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, has_key_frames) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="video", + parser_name="VideoParser", + meta={"content_type": "video", "format": format_str.lower()}, + ) + + async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str: + """ + Generate video description using key frames and audio transcription. 
+ + Args: + file_path: Video file path + config: Video parsing configuration + + Returns: + Video description in markdown format + + TODO: Integrate with actual video processing libraries + """ + # Fallback implementation - returns basic placeholder + return "Video description (video processing integration pending)\n\nThis is a video. Video processing feature has not yet integrated external libraries." + + async def _generate_semantic_info( + self, node: ResourceNode, temp_uri: str, viking_fs, has_key_frames: bool + ): + """ + Phase 2: Generate abstract and overview. + + Args: + node: ResourceNode to update + temp_uri: Temporary URI + viking_fs: VikingFS instance + has_key_frames: Whether key frames directory exists + """ + # Read description.md + description = await viking_fs.read_file(f"{temp_uri}/description.md") + + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n", + "- description.md: Detailed video description\n", + ] + + if has_key_frames: + overview_parts.append("- keyframes/: Directory containing extracted key frames\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### Play Video\n") + overview_parts.append("```python\n") + overview_parts.append("video_bytes = await video_resource.play()\n") + overview_parts.append("# Returns: Video file binary data\n") + overview_parts.append("# Purpose: Play or save the video\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Video Description\n") + overview_parts.append("```python\n") + overview_parts.append("description = await 
video_resource.description()\n") + overview_parts.append("# Returns: FileContent object for further processing\n") + overview_parts.append("# Purpose: Understand video content\n") + overview_parts.append("```\n\n") + + if has_key_frames: + overview_parts.append("### Get Key Frames\n") + overview_parts.append("```python\n") + overview_parts.append("keyframes = await video_resource.keyframes()\n") + overview_parts.append("# Returns: List of key frame resources\n") + overview_parts.append("# Purpose: Analyze video scenes\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Video Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"duration = video_resource.get_duration() # {node.meta['duration']}s\n" + ) + overview_parts.append( + f"resolution = video_resource.get_resolution() # ({node.meta['width']}, {node.meta['height']})\n" + ) + overview_parts.append(f"fps = video_resource.get_fps() # {node.meta['fps']}\n") + overview_parts.append(f'format = video_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse video from content string - Not yet implemented. 
+ + Args: + content: Video content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with video content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Video parsing from content not yet implemented") diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py index dd35062c..11af717f 100644 --- a/openviking/parse/registry.py +++ b/openviking/parse/registry.py @@ -74,12 +74,18 @@ def __init__(self, register_optional: bool = True): # Register optional media parsers if register_optional: try: - from openviking.parse.parsers.media import ImageParser + from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser self.register("image", ImageParser()) logger.info("Registered ImageParser for image formats") + + self.register("audio", AudioParser()) + logger.info("Registered AudioParser for audio formats") + + self.register("video", VideoParser()) + logger.info("Registered VideoParser for video formats") except ImportError as e: - logger.debug(f"ImageParser not registered: {e}") + logger.debug(f"Media parsers not registered: {e}") def register(self, name: str, parser: BaseParser) -> None: """ diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 35359044..b4bcb3dd 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -108,14 +108,48 @@ async def finalize_from_temp( doc_dirs = [e for e in entries if e.get("isDir") and e["name"] not in [".", ".."]] if len(doc_dirs) != 1: - raise ValueError(f"Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}") + logger.error( + f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}" + ) + raise ValueError( + f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}" + ) doc_name = doc_dirs[0]["name"] doc_uri = 
f"{temp_uri}/{doc_name}" # 2. Determine base_uri if base_uri is None: - base_uri = self._get_base_uri(scope) + # Check if it's a media file (image/audio/video) + media_type = None + if source_format: + if source_format in ["image", "audio", "video"]: + media_type = source_format + elif source_path: + from pathlib import Path + + ext = Path(source_path).suffix.lower() + image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] + audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv"] + if ext in image_exts: + media_type = "image" + elif ext in audio_exts: + media_type = "audio" + elif ext in video_exts: + media_type = "video" + + if media_type: + # Map singular media types to plural directory names + media_dir_map = {"image": "images", "audio": "audio", "video": "video"} + media_dir = media_dir_map.get(media_type, media_type) + # Get current date in YYYYMMDD format + from datetime import datetime + + date_str = datetime.now().strftime("%Y%m%d") + base_uri = f"viking://resources/{media_dir}/{date_str}" + else: + base_uri = self._get_base_uri(scope) logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}") From 2f3ec8c6d8a2bf3759d26a8d31a9843b0b3a2563 Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 13:30:30 +0800 Subject: [PATCH 04/18] Enhance CLI robustness: validate add-resource path exists and detect unquoted spaces --- crates/ov_cli/src/main.rs | 25 ++++++++++++ openviking_cli/cli/commands/resources.py | 50 ++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index b98aaf4c..4c2eb9be 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -458,6 +458,31 @@ async fn handle_add_resource( timeout: Option, ctx: CliContext, ) -> Result<()> { + // Validate path: if it's a local path, check if it exists + if !path.starts_with("http://") && !path.starts_with("https://") { + 
use std::path::Path; + + let path_obj = Path::new(&path); + if !path_obj.exists() { + eprintln!("Error: Path '{}' does not exist.", path); + + // Check if there might be unquoted spaces + use std::env; + let args: Vec = env::args().collect(); + + if let Some(add_resource_pos) = args.iter().position(|s| s == "add-resource" || s == "add") { + if args.len() > add_resource_pos + 2 { + let extra_args = &args[add_resource_pos + 2..]; + let suggested_path = format!("{} {}", path, extra_args.join(" ")); + eprintln!("\nIt looks like you may have forgotten to quote a path with spaces."); + eprintln!("Suggested command: ov add-resource \"{}\"", suggested_path); + } + } + + std::process::exit(1); + } + } + let client = ctx.get_client(); commands::resources::add_resource( &client, &path, to, reason, instruction, wait, timeout, ctx.output_format, ctx.compact diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py index 92940dd7..cd72b76a 100644 --- a/openviking_cli/cli/commands/resources.py +++ b/openviking_cli/cli/commands/resources.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 """Resource management commands.""" +from pathlib import Path from typing import Optional import typer @@ -23,6 +24,55 @@ def add_resource_command( timeout: Optional[float] = typer.Option(600.0, help="Wait timeout in seconds"), ) -> None: """Add resources into OpenViking.""" + # Validate path: if it's a local path, check if it exists + if not (path.startswith("http://") or path.startswith("https://")): + local_path = Path(path) + if not local_path.exists(): + # Check if there are extra arguments (possible unquoted path with spaces) + import sys + + # Find the index of 'add-resource' in sys.argv + try: + add_resource_idx = sys.argv.index("add-resource") + except ValueError: + add_resource_idx = sys.argv.index("add") if "add" in sys.argv else -1 + + if add_resource_idx != -1 and len(sys.argv) > add_resource_idx + 2: + # There are extra positional 
arguments - likely unquoted path with spaces + extra_args = sys.argv[add_resource_idx + 2 :] + suggested_path = f"{path} {' '.join(extra_args)}" + typer.echo( + typer.style( + f"Error: Path '{path}' does not exist.", + fg=typer.colors.RED, + bold=True, + ), + err=True, + ) + typer.echo( + typer.style( + "\nIt looks like you may have forgotten to quote a path with spaces.", + fg=typer.colors.YELLOW, + ), + err=True, + ) + typer.echo( + typer.style( + f'Suggested command: ov add-resource "{suggested_path}"', + fg=typer.colors.CYAN, + ), + err=True, + ) + raise typer.Exit(code=1) + else: + typer.echo( + typer.style( + f"Error: Path '{path}' does not exist.", fg=typer.colors.RED, bold=True + ), + err=True, + ) + raise typer.Exit(code=1) + run( ctx, lambda client: client.add_resource( From c4b5960a6a94455ca5d79d41ffe57a02a65bad37 Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 13:37:40 +0800 Subject: [PATCH 05/18] Fix unescaped spaces in paths by replacing \ with space --- crates/ov_cli/src/main.rs | 7 +++++-- openviking_cli/cli/commands/resources.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 4c2eb9be..e245a0e4 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -450,7 +450,7 @@ async fn main() { } async fn handle_add_resource( - path: String, + mut path: String, to: Option, reason: String, instruction: String, @@ -462,7 +462,9 @@ async fn handle_add_resource( if !path.starts_with("http://") && !path.starts_with("https://") { use std::path::Path; - let path_obj = Path::new(&path); + // Unescape path: replace backslash followed by space with just space + let unescaped_path = path.replace("\\ ", " "); + let path_obj = Path::new(&unescaped_path); if !path_obj.exists() { eprintln!("Error: Path '{}' does not exist.", path); @@ -481,6 +483,7 @@ async fn handle_add_resource( std::process::exit(1); } + path = unescaped_path; } let client = 
ctx.get_client(); diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py index cd72b76a..a9bfc28f 100644 --- a/openviking_cli/cli/commands/resources.py +++ b/openviking_cli/cli/commands/resources.py @@ -25,8 +25,11 @@ def add_resource_command( ) -> None: """Add resources into OpenViking.""" # Validate path: if it's a local path, check if it exists + final_path = path if not (path.startswith("http://") or path.startswith("https://")): - local_path = Path(path) + unescaped_path = path.replace("\\ ", " ") + local_path = Path(unescaped_path) + final_path = unescaped_path if not local_path.exists(): # Check if there are extra arguments (possible unquoted path with spaces) import sys @@ -76,7 +79,7 @@ def add_resource_command( run( ctx, lambda client: client.add_resource( - path=path, + path=final_path, target=to, reason=reason, instruction=instruction, From da4386ad4f5aa29d50688c3dc50f15705a6599f4 Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 13:42:51 +0800 Subject: [PATCH 06/18] Sanitize URI components to replace spaces and special chars with underscores --- openviking/parse/parsers/media/audio.py | 4 +++- openviking/parse/parsers/media/image.py | 4 +++- openviking/parse/parsers/media/video.py | 4 +++- openviking/parse/tree_builder.py | 4 +++- openviking_cli/utils/uri.py | 4 ++-- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py index 372a0cab..e60e7fbe 100644 --- a/openviking/parse/parsers/media/audio.py +++ b/openviking/parse/parsers/media/audio.py @@ -94,7 +94,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) audio_bytes = file_path.read_bytes() ext = file_path.suffix - root_dir_name = file_path.stem + from openviking_cli.utils.uri import VikingURI + + root_dir_name = VikingURI.sanitize_segment(file_path.stem) root_dir_uri = f"{temp_uri}/{root_dir_name}" await 
viking_fs.mkdir(root_dir_uri) diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py index 0965b730..adde531c 100644 --- a/openviking/parse/parsers/media/image.py +++ b/openviking/parse/parsers/media/image.py @@ -106,7 +106,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) image_bytes = file_path.read_bytes() ext = file_path.suffix - root_dir_name = file_path.stem + from openviking_cli.utils.uri import VikingURI + + root_dir_name = VikingURI.sanitize_segment(file_path.stem) root_dir_uri = f"{temp_uri}/{root_dir_name}" await viking_fs.mkdir(root_dir_uri) diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py index 807816e1..fe50776d 100644 --- a/openviking/parse/parsers/media/video.py +++ b/openviking/parse/parsers/media/video.py @@ -95,7 +95,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) video_bytes = file_path.read_bytes() ext = file_path.suffix - root_dir_name = file_path.stem + from openviking_cli.utils.uri import VikingURI + + root_dir_name = VikingURI.sanitize_segment(file_path.stem) root_dir_uri = f"{temp_uri}/{root_dir_name}" await viking_fs.mkdir(root_dir_uri) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index b4bcb3dd..11ff07be 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -115,7 +115,9 @@ async def finalize_from_temp( f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}" ) - doc_name = doc_dirs[0]["name"] + from openviking_cli.utils.uri import VikingURI + + doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"]) doc_uri = f"{temp_uri}/{doc_name}" # 2. 
Determine base_uri diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py index 9cf6d856..efa8744f 100644 --- a/openviking_cli/utils/uri.py +++ b/openviking_cli/utils/uri.py @@ -201,7 +201,7 @@ def build_semantic_uri( Build a semantic URI based on parent URI. """ # Sanitize semantic name for URI - safe_name = VikingURI._sanitize_segment(semantic_name) + safe_name = VikingURI.sanitize_segment(semantic_name) if not is_leaf: return f"{parent_uri}/{safe_name}" @@ -211,7 +211,7 @@ def build_semantic_uri( return f"{parent_uri}/{safe_name}/{node_id}" @staticmethod - def _sanitize_segment(text: str) -> str: + def sanitize_segment(text: str) -> str: """ Sanitize text for use in URI segment. From b2fdfc0225fa3db35ebdefbfb31e2f733c40f34a Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 16:15:40 +0800 Subject: [PATCH 07/18] feat: auto organize audio and image and video files --- openviking/parse/parsers/html.py | 6 +++- openviking/parse/parsers/markdown.py | 6 +++- openviking/parse/parsers/media/audio.py | 22 +++---------- openviking/parse/parsers/media/image.py | 22 +++---------- openviking/parse/parsers/media/video.py | 20 ++--------- openviking_cli/client/http.py | 27 +++++++++++++++ openviking_cli/utils/uri.py | 44 +++++++++++++++++++++++-- 7 files changed, 89 insertions(+), 58 deletions(-) diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 28e47885..85fd0c0a 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -601,6 +601,10 @@ async def parse_content( def _sanitize_for_path(self, text: str) -> str: """Sanitize text for use in file path.""" - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/markdown.py 
b/openviking/parse/parsers/markdown.py index e6ddbe3a..1570baf3 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -334,7 +334,11 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]: return parts if parts else [content] def _sanitize_for_path(self, text: str) -> str: - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py index e60e7fbe..cfb7eaab 100644 --- a/openviking/parse/parsers/media/audio.py +++ b/openviking/parse/parsers/media/audio.py @@ -58,11 +58,10 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) Phase 1: Generate temporary files - Copy original audio to temp_uri/content.{ext} - - Generate description.md using ASR - (Optional) Generate transcript with timestamps Phase 2: Generate semantic info - - Generate abstract and overview based on description.md + - Generate abstract and overview based on description - Overview includes file list and usage instructions Phase 3: Build directory structure @@ -143,8 +142,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) # Fallback: basic description description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)" - await viking_fs.write_file(f"{root_dir_uri}/description.md", description) - # 1.4 Transcript with timestamps (optional) transcript_text = None if self.config.enable_transcription and self.config.enable_timestamps: @@ -175,7 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) # Phase 2: Generate semantic info await self._generate_semantic_info( - root_node, root_dir_uri, viking_fs, transcript_text 
is not None + root_node, description, viking_fs, transcript_text is not None ) # Phase 3: Build directory structure (handled by TreeBuilder) @@ -223,20 +220,17 @@ async def _asr_transcribe_with_timestamps( return None async def _generate_semantic_info( - self, node: ResourceNode, temp_uri: str, viking_fs, has_transcript: bool + self, node: ResourceNode, description: str, viking_fs, has_transcript: bool ): """ Phase 2: Generate abstract and overview. Args: node: ResourceNode to update - temp_uri: Temporary URI + description: Audio description viking_fs: VikingFS instance has_transcript: Whether transcript file exists """ - # Read description.md - description = await viking_fs.read_file(f"{temp_uri}/description.md") - # Generate abstract (short summary, < 100 tokens) abstract = description[:200] if len(description) > 200 else description @@ -246,7 +240,6 @@ async def _generate_semantic_info( description, "\n\n## Available Files\n", f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n", - "- description.md: Detailed audio transcription generated by ASR\n", ] if has_transcript: @@ -260,13 +253,6 @@ async def _generate_semantic_info( overview_parts.append("# Purpose: Play or save the audio\n") overview_parts.append("```\n\n") - overview_parts.append("### Get ASR-generated Transcription\n") - overview_parts.append("```python\n") - overview_parts.append("transcription = await audio_resource.transcription()\n") - overview_parts.append("# Returns: FileContent object for further processing\n") - overview_parts.append("# Purpose: Understand audio content\n") - overview_parts.append("```\n\n") - if has_transcript: overview_parts.append("### Get Timestamps Transcript\n") overview_parts.append("```python\n") diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py index adde531c..544ba80c 100644 --- 
a/openviking/parse/parsers/media/image.py +++ b/openviking/parse/parsers/media/image.py @@ -70,11 +70,10 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) Phase 1: Generate temporary files - Copy original image to temp_uri/content.{ext} - - Generate description.md using VLM - (Optional) Generate ocr.md using OCR Phase 2: Generate semantic info - - Generate abstract and overview based on description.md + - Generate abstract and overview based on description - Overview includes file list and usage instructions Phase 3: Build directory structure @@ -134,8 +133,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) # Fallback: basic description description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" - await viking_fs.write_file(f"{root_dir_uri}/description.md", description) - # 1.4 OCR (optional) ocr_text = None if self.config.enable_ocr: @@ -162,7 +159,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) ) # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, ocr_text is not None) + await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None) # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( @@ -207,20 +204,17 @@ async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: return None async def _generate_semantic_info( - self, node: ResourceNode, temp_uri: str, viking_fs, has_ocr: bool + self, node: ResourceNode, description: str, viking_fs, has_ocr: bool ): """ Phase 2: Generate abstract and overview. 
Args: node: ResourceNode to update - temp_uri: Temporary URI + description: Image description viking_fs: VikingFS instance has_ocr: Whether OCR file exists """ - # Read description.md - description = await viking_fs.read_file(f"{temp_uri}/description.md") - # Generate abstract (short summary, < 100 tokens) abstract = description[:200] if len(description) > 200 else description @@ -230,7 +224,6 @@ async def _generate_semantic_info( description, "\n\n## Available Files\n", f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", - "- description.md: Detailed image description generated by VLM\n", ] if has_ocr: @@ -244,13 +237,6 @@ async def _generate_semantic_info( overview_parts.append("# Purpose: Display or save the image\n") overview_parts.append("```\n\n") - overview_parts.append("### Get VLM-generated Image Description\n") - overview_parts.append("```python\n") - overview_parts.append("description = await image_resource.description()\n") - overview_parts.append("# Returns: FileContent object for further processing\n") - overview_parts.append("# Purpose: Understand image content\n") - overview_parts.append("```\n\n") - if has_ocr: overview_parts.append("### Get OCR-recognized Text\n") overview_parts.append("```python\n") diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py index fe50776d..ea274e2c 100644 --- a/openviking/parse/parsers/media/video.py +++ b/openviking/parse/parsers/media/video.py @@ -59,7 +59,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) Phase 1: Generate temporary files - Copy original video to temp_uri/content.{ext} - Extract key frames - - Generate description.md for each frame using VLM - Extract audio track and transcribe using ASR Phase 2: Generate semantic info @@ -145,8 +144,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) # Fallback: basic 
description description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)" - await viking_fs.write_file(f"{root_dir_uri}/description.md", description) - # 1.4 Key frames (optional) key_frames_dir = f"{root_dir_uri}/keyframes" has_key_frames = False @@ -175,7 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) ) # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, has_key_frames) + await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames) # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( @@ -204,20 +201,17 @@ async def _generate_video_description(self, file_path: Path, config: VideoConfig return "Video description (video processing integration pending)\n\nThis is a video. Video processing feature has not yet integrated external libraries." async def _generate_semantic_info( - self, node: ResourceNode, temp_uri: str, viking_fs, has_key_frames: bool + self, node: ResourceNode, description: str, viking_fs, has_key_frames: bool ): """ Phase 2: Generate abstract and overview. 
Args: node: ResourceNode to update - temp_uri: Temporary URI + description: Video description viking_fs: VikingFS instance has_key_frames: Whether key frames directory exists """ - # Read description.md - description = await viking_fs.read_file(f"{temp_uri}/description.md") - # Generate abstract (short summary, < 100 tokens) abstract = description[:200] if len(description) > 200 else description @@ -227,7 +221,6 @@ async def _generate_semantic_info( description, "\n\n## Available Files\n", f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n", - "- description.md: Detailed video description\n", ] if has_key_frames: @@ -241,13 +234,6 @@ async def _generate_semantic_info( overview_parts.append("# Purpose: Play or save the video\n") overview_parts.append("```\n\n") - overview_parts.append("### Get Video Description\n") - overview_parts.append("```python\n") - overview_parts.append("description = await video_resource.description()\n") - overview_parts.append("# Returns: FileContent object for further processing\n") - overview_parts.append("# Purpose: Understand video content\n") - overview_parts.append("```\n\n") - if has_key_frames: overview_parts.append("### Get Key Frames\n") overview_parts.append("```python\n") diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index 39526602..a5cb6903 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -36,6 +36,7 @@ load_json_config, resolve_config_path, ) +from openviking_cli.utils.uri import VikingURI # Error code to exception class mapping ERROR_CODE_TO_EXCEPTION = { @@ -281,6 +282,7 @@ async def ls( node_limit: int = 1000, ) -> List[Any]: """List directory contents.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/fs/ls", params={ @@ -304,6 +306,7 @@ async def tree( node_limit: int = 1000, ) -> List[Dict[str, 
Any]]: """Get directory tree.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/fs/tree", params={ @@ -318,6 +321,7 @@ async def tree( async def stat(self, uri: str) -> Dict[str, Any]: """Get resource status.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/fs/stat", params={"uri": uri}, @@ -326,6 +330,7 @@ async def stat(self, uri: str) -> Dict[str, Any]: async def mkdir(self, uri: str) -> None: """Create directory.""" + uri = VikingURI.normalize(uri) response = await self._http.post( "/api/v1/fs/mkdir", json={"uri": uri}, @@ -334,6 +339,7 @@ async def mkdir(self, uri: str) -> None: async def rm(self, uri: str, recursive: bool = False) -> None: """Remove resource.""" + uri = VikingURI.normalize(uri) response = await self._http.request( "DELETE", "/api/v1/fs", @@ -343,6 +349,8 @@ async def rm(self, uri: str, recursive: bool = False) -> None: async def mv(self, from_uri: str, to_uri: str) -> None: """Move resource.""" + from_uri = VikingURI.normalize(from_uri) + to_uri = VikingURI.normalize(to_uri) response = await self._http.post( "/api/v1/fs/mv", json={"from_uri": from_uri, "to_uri": to_uri}, @@ -353,6 +361,7 @@ async def mv(self, from_uri: str, to_uri: str) -> None: async def read(self, uri: str) -> str: """Read file content.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/content/read", params={"uri": uri}, @@ -361,6 +370,7 @@ async def read(self, uri: str) -> str: async def abstract(self, uri: str) -> str: """Read L0 abstract.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/content/abstract", params={"uri": uri}, @@ -369,6 +379,7 @@ async def abstract(self, uri: str) -> str: async def overview(self, uri: str) -> str: """Read L1 overview.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/content/overview", params={"uri": uri}, @@ -386,6 +397,8 @@ async def find( filter: Optional[Dict[str, Any]] = None, ) -> FindResult: 
"""Semantic search without session context.""" + if target_uri: + target_uri = VikingURI.normalize(target_uri) response = await self._http.post( "/api/v1/search/find", json={ @@ -409,6 +422,8 @@ async def search( filter: Optional[Dict[str, Any]] = None, ) -> FindResult: """Semantic search with optional session context.""" + if target_uri: + target_uri = VikingURI.normalize(target_uri) sid = session_id or (session.session_id if session else None) response = await self._http.post( "/api/v1/search/search", @@ -425,6 +440,7 @@ async def search( async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) -> Dict[str, Any]: """Content search with pattern.""" + uri = VikingURI.normalize(uri) response = await self._http.post( "/api/v1/search/grep", json={ @@ -437,6 +453,7 @@ async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) -> async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]: """File pattern matching.""" + uri = VikingURI.normalize(uri) response = await self._http.post( "/api/v1/search/glob", json={"pattern": pattern, "uri": uri}, @@ -447,6 +464,7 @@ async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]: async def relations(self, uri: str) -> List[Any]: """Get relations for a resource.""" + uri = VikingURI.normalize(uri) response = await self._http.get( "/api/v1/relations", params={"uri": uri}, @@ -455,6 +473,11 @@ async def relations(self, uri: str) -> List[Any]: async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str = "") -> None: """Create link between resources.""" + from_uri = VikingURI.normalize(from_uri) + if isinstance(to_uris, str): + to_uris = VikingURI.normalize(to_uris) + else: + to_uris = [VikingURI.normalize(u) for u in to_uris] response = await self._http.post( "/api/v1/relations/link", json={"from_uri": from_uri, "to_uris": to_uris, "reason": reason}, @@ -463,6 +486,8 @@ async def link(self, from_uri: str, to_uris: Union[str, List[str]], 
reason: str async def unlink(self, from_uri: str, to_uri: str) -> None: """Remove link between resources.""" + from_uri = VikingURI.normalize(from_uri) + to_uri = VikingURI.normalize(to_uri) response = await self._http.request( "DELETE", "/api/v1/relations/link", @@ -512,6 +537,7 @@ async def add_message(self, session_id: str, role: str, content: str) -> Dict[st async def export_ovpack(self, uri: str, to: str) -> str: """Export context as .ovpack file.""" + uri = VikingURI.normalize(uri) response = await self._http.post( "/api/v1/pack/export", json={"uri": uri, "to": to}, @@ -527,6 +553,7 @@ async def import_ovpack( vectorize: bool = True, ) -> str: """Import .ovpack file.""" + parent = VikingURI.normalize(parent) response = await self._http.post( "/api/v1/pack/import", json={ diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py index efa8744f..6f50a3a9 100644 --- a/openviking_cli/utils/uri.py +++ b/openviking_cli/utils/uri.py @@ -215,7 +215,8 @@ def sanitize_segment(text: str) -> str: """ Sanitize text for use in URI segment. - Preserves Chinese characters but replaces special characters. + Preserves CJK characters (Chinese, Japanese, Korean) and other common scripts + while replacing special characters. 
Args: text: Original text @@ -223,8 +224,18 @@ def sanitize_segment(text: str) -> str: Returns: URI-safe string """ - # Preserve Chinese characters, letters, numbers, underscores, hyphens - safe = re.sub(r"[^\w\u4e00-\u9fff\-]", "_", text) + # Preserve: + # - Letters, numbers, underscores, hyphens (\w includes [a-zA-Z0-9_]) + # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja) + # - Hiragana and Katakana (Japanese) + # - Hangul Syllables (Korean) + # - CJK Unified Ideographs Extension A + # - CJK Unified Ideographs Extension B + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]", + "_", + text, + ) # Merge consecutive underscores safe = re.sub(r"_+", "_", safe) # Strip leading/trailing underscores and limit length @@ -245,6 +256,33 @@ def __eq__(self, other) -> bool: def __hash__(self) -> int: return hash(self.uri) + @staticmethod + def normalize(uri: str) -> str: + """ + Normalize URI by ensuring it has the viking:// scheme. + + If the input already starts with viking://, returns it as-is. + If it starts with /, prepends viking:// (resulting in viking:///... which is invalid, + so we strip leading / first). + Otherwise, prepends viking://. 
+ + Examples: + "/resources/images" -> "viking://resources/images" + "resources/images" -> "viking://resources/images" + "viking://resources/images" -> "viking://resources/images" + + Args: + uri: Input URI string + + Returns: + Normalized URI with viking:// scheme + """ + if uri.startswith(f"{VikingURI.SCHEME}://"): + return uri + # Strip leading slashes + uri = uri.lstrip("/") + return f"{VikingURI.SCHEME}://{uri}" + @classmethod def create_temp_uri(cls) -> str: """Create temp directory URI like viking://temp/MMDDHHMM_XXXXXX""" From 1118919d6d580963c4482bee497d969154c935a3 Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 16:41:05 +0800 Subject: [PATCH 08/18] Update media parsers to use original filenames and folder names with extensions --- openviking/parse/parsers/README.md | 5 +++-- openviking/parse/parsers/media/audio.py | 14 ++++++++++---- openviking/parse/parsers/media/image.py | 14 ++++++++++---- openviking/parse/parsers/media/video.py | 14 ++++++++++---- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index 7f452eb0..34eafd2e 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -155,14 +155,15 @@ L1: """ * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件 * 对于每个媒体子目录下,每次上传的文件放在当前日期(而非文件内部元信息时间)的子目录下,例如: * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件 -* 对于每个多媒体文件,默认创建一个文件夹,文件夹名称与文件名想同但默认不包含后缀,例如: - * 上传文件 `20240820_123456.jpg` 后,默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456` 用于存储该文件的相关内容 +* 对于每个多媒体文件,默认创建一个文件夹,文件夹名称与文件名想同但默认包含后缀,例如: + * 上传文件 `20240820_123456.jpg` 后,默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456_jpg` 用于存储该文件的相关内容 * 该文件夹内默认包含一个 `.abstract.md` 文件,用于存储该文件的摘要信息 * 例如:图片文件的摘要可能是图片的文件名、内容描述、画面风格等,正常不应超过 200 token * 该文件夹内默认包含一个 `.overview.md` 文件,用于存储该文件的概览内容,例如: * 图片文件的概览内容除了包含 `.abstract.md` 中的内容,还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等 * 
音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果,以及主要的章节对应的时间线等 * 视频文件的概览内容可能包含视频的文件名、时长、使用场景等,对于较大的视频,未来会对视频进行切分,可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等,因此视频的处理逻辑预期较为复杂,可等待图片、音频实现后,参考文件夹或 zip 的递归处理形态进行处理。 + * 该文件夹内需要放置原始文件,保持原始文件名,例如 `20240820_123456.jpg`,但如果文件名包含空格字符,需要将其替换为下划线 `_`,因为 OpenViking URI 不允许包含空格字符。 ## 核心组件 diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py index cfb7eaab..f0a018c3 100644 --- a/openviking/parse/parsers/media/audio.py +++ b/openviking/parse/parsers/media/audio.py @@ -95,12 +95,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) from openviking_cli.utils.uri import VikingURI - root_dir_name = VikingURI.sanitize_segment(file_path.stem) + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") root_dir_uri = f"{temp_uri}/{root_dir_name}" await viking_fs.mkdir(root_dir_uri) - # 1.1 Save original audio - await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", audio_bytes) + # 1.1 Save original audio with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", audio_bytes) # 1.2 Validate audio file using magic bytes # Define magic bytes for supported audio formats @@ -167,6 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) "content_type": "audio", "source_title": file_path.stem, "semantic_name": file_path.stem, + "original_filename": original_filename, }, ) @@ -239,7 +245,7 @@ async def _generate_semantic_info( "## Content Summary\n", description, "\n\n## Available Files\n", - f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, 
{node.meta['format'].upper()} format)\n", + f"- {node.meta['original_filename']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n", ] if has_transcript: diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py index 544ba80c..c82b9589 100644 --- a/openviking/parse/parsers/media/image.py +++ b/openviking/parse/parsers/media/image.py @@ -107,12 +107,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) from openviking_cli.utils.uri import VikingURI - root_dir_name = VikingURI.sanitize_segment(file_path.stem) + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") root_dir_uri = f"{temp_uri}/{root_dir_name}" await viking_fs.mkdir(root_dir_uri) - # 1.1 Save original image - await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", image_bytes) + # 1.1 Save original image with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", image_bytes) # 1.2 Validate and extract image metadata try: @@ -155,6 +160,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) "content_type": "image", "source_title": file_path.stem, "semantic_name": file_path.stem, + "original_filename": original_filename, }, ) @@ -223,7 +229,7 @@ async def _generate_semantic_info( "## Content Summary\n", description, "\n\n## Available Files\n", - f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", + f"- {node.meta['original_filename']}: Original image file 
({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", ] if has_ocr: diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py index ea274e2c..53cccf67 100644 --- a/openviking/parse/parsers/media/video.py +++ b/openviking/parse/parsers/media/video.py @@ -96,12 +96,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) from openviking_cli.utils.uri import VikingURI - root_dir_name = VikingURI.sanitize_segment(file_path.stem) + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") root_dir_uri = f"{temp_uri}/{root_dir_name}" await viking_fs.mkdir(root_dir_uri) - # 1.1 Save original video - await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", video_bytes) + # 1.1 Save original video with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", video_bytes) # 1.2 Validate video file using magic bytes # Define magic bytes for supported video formats @@ -168,6 +173,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) "content_type": "video", "source_title": file_path.stem, "semantic_name": file_path.stem, + "original_filename": original_filename, }, ) @@ -220,7 +226,7 @@ async def _generate_semantic_info( "## Content Summary\n", description, "\n\n## Available Files\n", - f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n", + f"- {node.meta['original_filename']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, 
{node.meta['fps']}fps, {node.meta['format'].upper()} format)\n", ] if has_key_frames: From b1fc5915e816eeaa051293a6f9f64bb01f503dde Mon Sep 17 00:00:00 2001 From: openviking Date: Mon, 16 Feb 2026 17:05:25 +0800 Subject: [PATCH 09/18] Optimize MediaParser section for readability --- openviking/parse/parsers/README.md | 72 +++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index 34eafd2e..94fcc23c 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -144,26 +144,64 @@ L1: """ 代码解析器,支持语法高亮和代码结构分析。能识别函数、类、方法等代码元素。 ### 6. MediaParser (`media.py`) -**支持格式**: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.mp4`, `.mov`, `.avi`, `.webm`, `.mp3`, `.wav`, `.m4a`, `.flac` + +**支持格式**: +- 图片: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp` +- 视频: `.mp4`, `.mov`, `.avi`, `.webm` +- 音频: `.mp3`, `.wav`, `.m4a`, `.flac` 多媒体解析器,使用 VLM(视觉语言模型)分析图像、视频和音频内容,生成文本描述。 -对于添加多媒体文件的存储组织方式,我们采用以下策略: -* 在 viking://resource 下创建 images, audio, video 三个媒体子目录,分别是: - * viking://resource/images 用于存储提交时未明确指定目标路径的图片文件 - * viking://resource/audio 用于存储提交时未明确指定目标路径的音频文件 - * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件 -* 对于每个媒体子目录下,每次上传的文件放在当前日期(而非文件内部元信息时间)的子目录下,例如: - * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件 -* 对于每个多媒体文件,默认创建一个文件夹,文件夹名称与文件名想同但默认包含后缀,例如: - * 上传文件 `20240820_123456.jpg` 后,默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456_jpg` 用于存储该文件的相关内容 - * 该文件夹内默认包含一个 `.abstract.md` 文件,用于存储该文件的摘要信息 - * 例如:图片文件的摘要可能是图片的文件名、内容描述、画面风格等,正常不应超过 200 token - * 该文件夹内默认包含一个 `.overview.md` 文件,用于存储该文件的概览内容,例如: - * 图片文件的概览内容除了包含 `.abstract.md` 中的内容,还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等 - * 音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果,以及主要的章节对应的时间线等 - * 视频文件的概览内容可能包含视频的文件名、时长、使用场景等,对于较大的视频,未来会对视频进行切分,可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等,因此视频的处理逻辑预期较为复杂,可等待图片、音频实现后,参考文件夹或 zip 的递归处理形态进行处理。 - * 该文件夹内需要放置原始文件,保持原始文件名,例如 
`20240820_123456.jpg`,但如果文件名包含空格字符,需要将其替换为下划线 `_`,因为 OpenViking URI 不允许包含空格字符。 +#### 存储组织策略 + +多媒体文件的存储采用以下层级结构: + +``` +viking://resource/ +├── images/ # 图片文件 +│ └── 20240820/ # 上传日期(YYYYMMDD) +│ └── 20240820_123456_jpg/ # 文件文件夹(文件名_扩展名) +│ ├── .abstract.md # L0 摘要 +│ ├── .overview.md # L1 概览 +│ └── 20240820_123456.jpg # 原始文件 +├── audio/ # 音频文件 +│ └── 20240820/ +│ └── my_song_mp3/ +│ ├── .abstract.md +│ ├── .overview.md +│ └── my_song.mp3 +└── video/ # 视频文件 + └── 20240820/ + └── my_video_mp4/ + ├── .abstract.md + ├── .overview.md + └── my_video.mp4 +``` + +详细说明: + +1. **媒体子目录**: 在 `viking://resource` 下按类型划分为三个子目录 + - `viking://resource/images`: 存储未明确指定目标路径的图片文件 + - `viking://resource/audio`: 存储未明确指定目标路径的音频文件 + - `viking://resource/video`: 存储未明确指定目标路径的视频文件 + +2. **日期子目录**: 每次上传的文件按当前日期(格式:YYYYMMDD)组织,而非文件内部元信息的时间 + - 例如:`viking://resource/images/20240820/` 存储 2024年8月20日上传的所有图片 + +3. **文件文件夹**: 为每个多媒体文件创建一个专属文件夹,命名规则为:`文件名_扩展名`(扩展名不含点) + - 示例:上传 `20240820_123456.jpg` → 创建文件夹 `20240820_123456_jpg` + +4. 
**文件夹内容**: 每个文件文件夹内包含: + - **原始文件**: 保持原始文件名,空格字符替换为下划线 `_`(因为 OpenViking URI 不允许包含空格) + - 示例:`photo 1.jpg` → 保存为 `photo_1.jpg` + - `.abstract.md` (L0 层): 摘要信息(<200 token) + - 图片:文件名、内容描述、画面风格等 + - 音频:文件名、时长、内容概述等 + - 视频:文件名、时长、内容概述等 + - `.overview.md` (L1 层): 概览信息 + - 图片:除摘要外,还包含尺寸、OCR 识别结果、场景和主体描述等 + - 音频:除摘要外,还包含语音/歌词识别结果、章节时间线等 + - 视频:除摘要外,还包含使用场景等(未来会支持切分视频,递归存储子文件) ## 核心组件 From 3e3483b9f3321c29846501627d326e27e166a16a Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 17 Feb 2026 23:31:43 +0800 Subject: [PATCH 10/18] feat: vlm optimization for image --- examples/chatmem/ov.conf.example | 3 +- examples/mcp-query/ov.conf.example | 3 +- examples/memex/ov.conf.example | 3 +- examples/ov.conf.example | 3 +- examples/query/ov.conf.example | 3 +- examples/server_client/ov.conf.example | 3 +- openviking/models/vlm/backends/litellm_vlm.py | 26 +++++++++++++-- openviking/models/vlm/backends/openai_vlm.py | 32 +++++++++++++------ .../models/vlm/backends/volcengine_vlm.py | 16 ++++++---- openviking/models/vlm/base.py | 12 +++++-- openviking/models/vlm/llm.py | 22 +++++++++---- openviking_cli/utils/config/vlm_config.py | 28 +++++++++------- 12 files changed, 109 insertions(+), 45 deletions(-) diff --git a/examples/chatmem/ov.conf.example b/examples/chatmem/ov.conf.example index 2e9a40ae..6a085e5d 100644 --- a/examples/chatmem/ov.conf.example +++ b/examples/chatmem/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/mcp-query/ov.conf.example b/examples/mcp-query/ov.conf.example index bf4a45de..fc40ea92 100644 --- a/examples/mcp-query/ov.conf.example +++ b/examples/mcp-query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "", "provider" : "volcengine", - "model" : "doubao-seed-1-8-251228" + 
"model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/memex/ov.conf.example b/examples/memex/ov.conf.example index d5187cd3..42697b72 100644 --- a/examples/memex/ov.conf.example +++ b/examples/memex/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark.cn-beijing.volces.com/api/v3", "api_key" : "your-volcengine-api-key", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 205cd7d9..34cbc6a4 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -45,7 +45,8 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false }, "rerank": { "ak": null, diff --git a/examples/query/ov.conf.example b/examples/query/ov.conf.example index 58d034c0..fdc7cb55 100644 --- a/examples/query/ov.conf.example +++ b/examples/query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "provider" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example index 582d79b8..13eb55db 100644 --- a/examples/server_client/ov.conf.example +++ b/examples/server_client/ov.conf.example @@ -34,6 +34,7 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false } } diff --git a/openviking/models/vlm/backends/litellm_vlm.py b/openviking/models/vlm/backends/litellm_vlm.py index f1efa562..2373e5dd 100644 --- a/openviking/models/vlm/backends/litellm_vlm.py +++ b/openviking/models/vlm/backends/litellm_vlm.py @@ -139,21 +139,31 @@ def _build_kwargs(self, model: str, messages: 
list) -> dict[str, Any]: return kwargs - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = completion(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking last_error = None for attempt in range(max_retries + 1): @@ -164,7 +174,7 @@ async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: except Exception as e: last_error = e if attempt < max_retries: - await asyncio.sleep(2 ** attempt) + await asyncio.sleep(2**attempt) if last_error: raise last_error @@ -174,6 +184,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -184,7 +195,11 @@ def get_vision_completion( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = 
completion(**kwargs) self._update_token_usage_from_response(response) @@ -194,6 +209,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -204,7 +220,11 @@ async def get_vision_completion_async( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = await acompletion(**kwargs) self._update_token_usage_from_response(response) diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index c6c5b230..18a22ff7 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -131,6 +131,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" client = self.get_client() @@ -140,11 +141,16 @@ def get_vision_completion( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" @@ -152,6 +158,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, 
bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" client = self.get_async_client() @@ -161,10 +168,15 @@ async def get_vision_completion_async( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = await client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = await client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py index f11a289e..b5841cc8 100644 --- a/openviking/models/vlm/backends/volcengine_vlm.py +++ b/openviking/models/vlm/backends/volcengine_vlm.py @@ -54,22 +54,26 @@ def get_async_client(self): ) return self._async_client - def get_completion(self, prompt: str) -> str: - return super().get_completion(prompt) + def get_completion(self, prompt: str, thinking: bool = False) -> str: + return super().get_completion(prompt, thinking) - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: - return await super().get_completion_async(prompt, max_retries) + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: + return await super().get_completion_async(prompt, thinking, max_retries) def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return super().get_vision_completion(prompt, images) + return super().get_vision_completion(prompt, images, thinking) async def 
get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return await super().get_vision_completion_async(prompt, images) + return await super().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking/models/vlm/base.py b/openviking/models/vlm/base.py index ef55f712..cd563f9c 100644 --- a/openviking/models/vlm/base.py +++ b/openviking/models/vlm/base.py @@ -27,12 +27,14 @@ def __init__(self, config: Dict[str, Any]): self._token_tracker = TokenUsageTracker() @abstractmethod - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion""" pass @abstractmethod - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously""" pass @@ -41,6 +43,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" pass @@ -50,6 +53,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" pass @@ -128,16 +132,20 @@ def create(config: Dict[str, Any]) -> VLMBase: if not use_litellm: if provider == "openai": from .backends.openai_vlm import OpenAIVLM + return OpenAIVLM(config) elif provider == "volcengine": from .backends.volcengine_vlm import VolcEngineVLM + return VolcEngineVLM(config) from .backends.litellm_vlm import LiteLLMVLMProvider + return LiteLLMVLMProvider(config) @staticmethod def get_available_providers() -> List[str]: """Get list of available providers""" from .registry import get_all_provider_names + return get_all_provider_names() diff --git a/openviking/models/vlm/llm.py b/openviking/models/vlm/llm.py index 
07c52179..6c8b9c56 100644 --- a/openviking/models/vlm/llm.py +++ b/openviking/models/vlm/llm.py @@ -168,34 +168,38 @@ def complete_json( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, ) -> Optional[Dict[str, Any]]: """Get JSON completion from VLM.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = self._get_vlm().get_completion(prompt) + response = self._get_vlm().get_completion(prompt, thinking) return parse_json_from_response(response) async def complete_json_async( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, + max_retries: int = 0, ) -> Optional[Dict[str, Any]]: """Async version of complete_json.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = await self._get_vlm().get_completion_async(prompt) + response = await self._get_vlm().get_completion_async(prompt, thinking, max_retries) return parse_json_from_response(response) def complete_model( self, prompt: str, model_class: Type[T], + thinking: bool = False, ) -> Optional[T]: """Get structured completion validated against a Pydantic model.""" schema = model_class.model_json_schema() - response = self.complete_json(prompt, schema=schema) + response = self.complete_json(prompt, schema=schema, thinking=thinking) if response is None: return None @@ -209,10 +213,14 @@ async def complete_model_async( self, prompt: str, model_class: Type[T], + thinking: bool = False, + max_retries: int = 0, ) -> Optional[T]: """Async version of complete_model.""" schema = model_class.model_json_schema() - response = await self.complete_json_async(prompt, schema=schema) + response = await self.complete_json_async( + prompt, schema=schema, thinking=thinking, max_retries=max_retries + ) if response is None: return None @@ -226,14 +234,16 @@ def get_vision_completion( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Get vision completion.""" - return 
self._get_vlm().get_vision_completion(prompt, images) + return self._get_vlm().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Async vision completion.""" - return await self._get_vlm().get_vision_completion_async(prompt, images) + return await self._get_vlm().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking_cli/utils/config/vlm_config.py b/openviking_cli/utils/config/vlm_config.py index ad1bea8f..411c7d76 100644 --- a/openviking_cli/utils/config/vlm_config.py +++ b/openviking_cli/utils/config/vlm_config.py @@ -15,17 +15,16 @@ class VLMConfig(BaseModel): max_retries: int = Field(default=2, description="Maximum retry attempts") provider: Optional[str] = Field(default=None, description="Provider type") - backend: Optional[str] = Field(default=None, description="Backend provider (Deprecated, use 'provider' instead)") + backend: Optional[str] = Field( + default=None, description="Backend provider (Deprecated, use 'provider' instead)" + ) providers: Dict[str, Dict[str, Any]] = Field( default_factory=dict, - description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}" + description="Multi-provider configuration, e.g. 
{'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}", ) - default_provider: Optional[str] = Field( - default=None, - description="Default provider name" - ) + default_provider: Optional[str] = Field(default=None, description="Default provider name") thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models") @@ -141,6 +140,7 @@ def get_vlm_instance(self) -> Any: if self._vlm_instance is None: config_dict = self._build_vlm_config_dict() from openviking.models.vlm import VLMFactory + self._vlm_instance = VLMFactory.create(config_dict) return self._vlm_instance @@ -166,13 +166,15 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]: return result - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get LLM completion.""" - return self.get_vlm_instance().get_completion(prompt) + return self.get_vlm_instance().get_completion(prompt, thinking) - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get LLM completion asynchronously, max_retries=0 means no retry.""" - return await self.get_vlm_instance().get_completion_async(prompt, max_retries) + return await self.get_vlm_instance().get_completion_async(prompt, thinking, max_retries) def is_available(self) -> bool: """Check if LLM is configured.""" @@ -182,14 +184,16 @@ def get_vision_completion( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Get LLM completion with images.""" - return self.get_vlm_instance().get_vision_completion(prompt, images) + return self.get_vlm_instance().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Get LLM completion with images asynchronously.""" - return await 
self.get_vlm_instance().get_vision_completion_async(prompt, images) + return await self.get_vlm_instance().get_vision_completion_async(prompt, images, thinking) From 5b818f5d888ec2df9bd77479c5b930055c08a4f6 Mon Sep 17 00:00:00 2001 From: openviking Date: Thu, 19 Feb 2026 17:07:25 +0800 Subject: [PATCH 11/18] feat: vlm optimization for image --- openviking/models/vlm/backends/openai_vlm.py | 12 -- .../models/vlm/backends/volcengine_vlm.py | 107 +++++++++++++++++- openviking/parse/directory_scan.py | 1 + openviking/parse/parsers/README.md | 6 +- openviking/parse/parsers/directory.py | 67 +++++++++-- openviking/parse/parsers/media/audio.py | 3 +- openviking/parse/parsers/media/constants.py | 15 +++ openviking/parse/parsers/media/image.py | 54 +++++++-- openviking/parse/parsers/media/video.py | 3 +- openviking/parse/registry.py | 67 +++++------ .../templates/parsing/image_summary.yaml | 31 +++++ .../storage/queuefs/semantic_processor.py | 20 +++- openviking_cli/utils/config/parser_config.py | 2 +- tests/parse/test_add_directory.py | 101 +++++++++++++++++ 14 files changed, 408 insertions(+), 81 deletions(-) create mode 100644 openviking/parse/parsers/media/constants.py create mode 100644 openviking/prompts/templates/parsing/image_summary.yaml diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index 18a22ff7..d6f6effa 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -61,9 +61,6 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str: "temperature": self.temperature, } - if self.provider == "volcengine": - kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} - response = client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" @@ -79,9 +76,6 @@ async def get_completion_async( "temperature": self.temperature, } - if self.provider == 
"volcengine": - kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} - last_error = None for attempt in range(max_retries + 1): try: @@ -147,9 +141,6 @@ def get_vision_completion( "temperature": self.temperature, } - if self.provider == "volcengine": - kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} - response = client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" @@ -174,9 +165,6 @@ async def get_vision_completion_async( "temperature": self.temperature, } - if self.provider == "volcengine": - kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} - response = await client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py index b5841cc8..e4c4d290 100644 --- a/openviking/models/vlm/backends/volcengine_vlm.py +++ b/openviking/models/vlm/backends/volcengine_vlm.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """VolcEngine VLM backend implementation""" +import asyncio +import base64 from pathlib import Path from typing import Any, Dict, List, Union @@ -55,12 +57,75 @@ def get_async_client(self): return self._async_client def get_completion(self, prompt: str, thinking: bool = False) -> str: - return super().get_completion(prompt, thinking) + """Get text completion""" + client = self.get_client() + kwargs = { + "model": self.model or "doubao-seed-1-8-251228", + "messages": [{"role": "user", "content": prompt}], + "temperature": self.temperature, + "thinking": {"type": "disabled" if not thinking else "enabled"}, + } + + response = client.chat.completions.create(**kwargs) + self._update_token_usage_from_response(response) + return response.choices[0].message.content or "" async def get_completion_async( self, prompt: str, 
thinking: bool = False, max_retries: int = 0 ) -> str: - return await super().get_completion_async(prompt, thinking, max_retries) + """Get text completion asynchronously""" + client = self.get_async_client() + kwargs = { + "model": self.model or "doubao-seed-1-8-251228", + "messages": [{"role": "user", "content": prompt}], + "temperature": self.temperature, + "thinking": {"type": "disabled" if not thinking else "enabled"}, + } + + last_error = None + for attempt in range(max_retries + 1): + try: + response = await client.chat.completions.create(**kwargs) + self._update_token_usage_from_response(response) + return response.choices[0].message.content or "" + except Exception as e: + last_error = e + if attempt < max_retries: + await asyncio.sleep(2**attempt) + + if last_error: + raise last_error + else: + raise RuntimeError("Unknown error in async completion") + + def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]: + """Prepare image data""" + if isinstance(image, bytes): + b64 = base64.b64encode(image).decode("utf-8") + return { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{b64}"}, + } + elif isinstance(image, Path) or ( + isinstance(image, str) and not image.startswith(("http://", "https://")) + ): + path = Path(image) + suffix = path.suffix.lower() + mime_type = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + }.get(suffix, "image/png") + with open(path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return { + "type": "image_url", + "image_url": {"url": f"data:{mime_type};base64,{b64}"}, + } + else: + return {"type": "image_url", "image_url": {"url": image}} def get_vision_completion( self, @@ -68,7 +133,24 @@ def get_vision_completion( images: List[Union[str, Path, bytes]], thinking: bool = False, ) -> str: - return super().get_vision_completion(prompt, images, thinking) + """Get vision completion""" + client = 
self.get_client() + + content = [] + for img in images: + content.append(self._prepare_image(img)) + content.append({"type": "text", "text": prompt}) + + kwargs = { + "model": self.model or "doubao-seed-1-8-251228", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + "thinking": {"type": "disabled" if not thinking else "enabled"}, + } + + response = client.chat.completions.create(**kwargs) + self._update_token_usage_from_response(response) + return response.choices[0].message.content or "" async def get_vision_completion_async( self, @@ -76,4 +158,21 @@ async def get_vision_completion_async( images: List[Union[str, Path, bytes]], thinking: bool = False, ) -> str: - return await super().get_vision_completion_async(prompt, images, thinking) + """Get vision completion asynchronously""" + client = self.get_async_client() + + content = [] + for img in images: + content.append(self._prepare_image(img)) + content.append({"type": "text", "text": prompt}) + + kwargs = { + "model": self.model or "doubao-seed-1-8-251228", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + "thinking": {"type": "disabled" if not thinking else "enabled"}, + } + + response = await client.chat.completions.create(**kwargs) + self._update_token_usage_from_response(response) + return response.choices[0].message.content or "" diff --git a/openviking/parse/directory_scan.py b/openviking/parse/directory_scan.py index 2b62b6c1..8da532f2 100644 --- a/openviking/parse/directory_scan.py +++ b/openviking/parse/directory_scan.py @@ -164,6 +164,7 @@ def _classify_file( Processable: ParserRegistry has a parser, or is_text_file (code/config/docs). 
""" + # Normal classification logic if registry.get_parser_for_file(file_path) is not None: return CLASS_PROCESSABLE if is_text_file(file_path): diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index 94fcc23c..ceffac29 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -143,14 +143,16 @@ L1: """ 代码解析器,支持语法高亮和代码结构分析。能识别函数、类、方法等代码元素。 -### 6. MediaParser (`media.py`) +### 6. MediaParser (`media/*.py`) **支持格式**: - 图片: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp` - 视频: `.mp4`, `.mov`, `.avi`, `.webm` - 音频: `.mp3`, `.wav`, `.m4a`, `.flac` -多媒体解析器,使用 VLM(视觉语言模型)分析图像、视频和音频内容,生成文本描述。 +多媒体解析器,使用 VLM(视觉语言模型)分析图像、视频和音频内容,生成文本描述。多媒体解析器当且仅当 add-resource 调用时只添加上述文件类型时生效。即: +1. 当添加目录时,系统将对多媒体文件暂不生成单独目录和文本描述,仅存储和进行递归摘要。 +2. 当单独添加多媒体文件时,多媒体解析器会直接解析该文件,然后通过单独目录存放,在目录下生成文本描述。 #### 存储组织策略 diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py index 5f2f05fa..6fc6d8a6 100644 --- a/openviking/parse/parsers/directory.py +++ b/openviking/parse/parsers/directory.py @@ -27,6 +27,7 @@ create_parse_result, ) from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.media.constants import MEDIA_EXTENSIONS from openviking_cli.utils.logger import get_logger if TYPE_CHECKING: @@ -74,7 +75,8 @@ async def parse( source: Path to the directory. instruction: Processing instruction (forwarded where applicable). **kwargs: Extra options forwarded to ``scan_directory``: - ``strict``, ``ignore_dirs``, ``include``, ``exclude``. + ``strict``, ``ignore_dirs``, ``include``, ``exclude``, + ``directly_upload_media``. Returns: ``ParseResult`` with ``temp_dir_path`` pointing to VikingFS temp. 
@@ -103,6 +105,7 @@ async def parse( include=kwargs.get("include"), exclude=kwargs.get("exclude"), ) + directly_upload_media = kwargs.get("directly_upload_media", True) processable_files = scan_result.all_processable_files() warnings.extend(scan_result.warnings) @@ -137,13 +140,35 @@ async def parse( for cf in processable_files: file_parser = self._assign_parser(cf, registry) parser_name = type(file_parser).__name__ if file_parser else "direct" - ok = await self._process_single_file( - cf, - file_parser, - target_uri, - viking_fs, - warnings, - ) + + # Check if this is a media parser and we should directly upload + is_media_parser = file_parser and parser_name in [ + "ImageParser", + "AudioParser", + "VideoParser", + ] + ext = Path(cf.path).suffix.lower() + is_media_file = ext in MEDIA_EXTENSIONS + + if directly_upload_media and is_media_parser and is_media_file: + # Directly upload media file without using media parser + ok = await self._upload_file_directly( + cf, + target_uri, + viking_fs, + warnings, + ) + parser_name = "direct_upload" + else: + # Normal processing with parser + ok = await self._process_single_file( + cf, + file_parser, + target_uri, + viking_fs, + warnings, + ) + if ok: file_count += 1 processed_files.append( @@ -332,6 +357,32 @@ async def _process_single_file( warnings.append(f"Failed to upload {rel_path}: {exc}") return False + @staticmethod + async def _upload_file_directly( + classified_file: "ClassifiedFile", + target_uri: str, + viking_fs: Any, + warnings: List[str], + ) -> bool: + """Directly upload a file without using its parser. + + Used for media files when directly_upload_media=True. + + Returns: + *True* on success, *False* on failure. 
+ """ + rel_path = classified_file.rel_path + src_file = classified_file.path + + try: + content = src_file.read_bytes() + dst_uri = f"{target_uri}/{rel_path}" + await viking_fs.write_file(dst_uri, content) + return True + except Exception as exc: + warnings.append(f"Failed to upload {rel_path}: {exc}") + return False + # ------------------------------------------------------------------ # VikingFS merge helpers # ------------------------------------------------------------------ diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py index f0a018c3..e9473658 100644 --- a/openviking/parse/parsers/media/audio.py +++ b/openviking/parse/parsers/media/audio.py @@ -29,6 +29,7 @@ from openviking.parse.base import NodeType, ParseResult, ResourceNode from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.media.constants import AUDIO_EXTENSIONS from openviking_cli.utils.config.parser_config import AudioConfig @@ -50,7 +51,7 @@ def __init__(self, config: Optional[AudioConfig] = None, **kwargs): @property def supported_extensions(self) -> List[str]: """Return supported audio file extensions.""" - return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + return AUDIO_EXTENSIONS async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ diff --git a/openviking/parse/parsers/media/constants.py b/openviking/parse/parsers/media/constants.py new file mode 100644 index 00000000..f8bc96cf --- /dev/null +++ b/openviking/parse/parsers/media/constants.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Constants for media parsers.""" + +# Image extensions supported by ImageParser +IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] + +# Audio extensions supported by AudioParser +AUDIO_EXTENSIONS = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + +# Video extensions supported by VideoParser +VIDEO_EXTENSIONS = [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] + +# All media extensions combined +MEDIA_EXTENSIONS = set(IMAGE_EXTENSIONS + AUDIO_EXTENSIONS + VIDEO_EXTENSIONS) diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py index c82b9589..059735d1 100644 --- a/openviking/parse/parsers/media/image.py +++ b/openviking/parse/parsers/media/image.py @@ -17,7 +17,12 @@ from openviking.parse.base import NodeType, ParseResult, ResourceNode from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.media.constants import IMAGE_EXTENSIONS +from openviking.storage.viking_fs import get_viking_fs +from openviking_cli.utils.config import get_openviking_config from openviking_cli.utils.config.parser_config import ImageConfig +from openviking_cli.utils.logger import get_logger +from openviking_cli.utils.uri import VikingURI # ============================================================================= # Configuration Classes @@ -62,7 +67,7 @@ def __init__(self, config: Optional[ImageConfig] = None, **kwargs): @property def supported_extensions(self) -> List[str]: """Return supported image file extensions.""" - return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] + return IMAGE_EXTENSIONS async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ @@ -91,7 +96,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) FileNotFoundError: If source file does not exist IOError: If image processing fails """ - from openviking.storage.viking_fs 
import get_viking_fs # Convert to Path object file_path = Path(source) if isinstance(source, str) else source @@ -105,8 +109,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) image_bytes = file_path.read_bytes() ext = file_path.suffix - from openviking_cli.utils.uri import VikingURI - # Sanitize original filename (replace spaces with underscores) original_filename = file_path.name.replace(" ", "_") # Root directory name: filename stem + _ + extension (without dot) @@ -165,7 +167,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) ) # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None) + await self._generate_semantic_info( + root_node, description, viking_fs, ocr_text is not None, root_dir_uri + ) # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( @@ -187,11 +191,34 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: Returns: Image description in markdown format - - TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.) """ - # Fallback implementation - returns basic placeholder - return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API." + from openviking.prompts import render_prompt + + logger = get_logger(__name__) + + try: + vlm = get_openviking_config().vlm + + # Render prompt + prompt = render_prompt( + "parsing.image_summary", + { + "context": "No additional context", + }, + ) + + # Call VLM + response = await vlm.get_vision_completion_async( + prompt=prompt, + images=[image_bytes], + ) + + return response.strip() + + except Exception as e: + logger.error(f"Error in VLM image description: {e}") + # Fallback to basic description + return "Image description (VLM integration failed)\n\nThis is an image file." 
async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: """ @@ -210,16 +237,17 @@ async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: return None async def _generate_semantic_info( - self, node: ResourceNode, description: str, viking_fs, has_ocr: bool + self, node: ResourceNode, description: str, viking_fs, has_ocr: bool, root_dir_uri: str ): """ - Phase 2: Generate abstract and overview. + Phase 2: Generate abstract and overview and write to .abstract.md and .overview.md. Args: node: ResourceNode to update description: Image description viking_fs: VikingFS instance has_ocr: Whether OCR file exists + root_dir_uri: Root directory URI to write semantic files """ # Generate abstract (short summary, < 100 tokens) abstract = description[:200] if len(description) > 200 else description @@ -265,6 +293,10 @@ async def _generate_semantic_info( node.meta["abstract"] = abstract node.meta["overview"] = overview + # Write to files in temp directory + await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract) + await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview) + async def parse_content( self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs ) -> ParseResult: diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py index 53cccf67..84ee468f 100644 --- a/openviking/parse/parsers/media/video.py +++ b/openviking/parse/parsers/media/video.py @@ -29,6 +29,7 @@ from openviking.parse.base import NodeType, ParseResult, ResourceNode from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.media.constants import VIDEO_EXTENSIONS from openviking_cli.utils.config.parser_config import VideoConfig @@ -50,7 +51,7 @@ def __init__(self, config: Optional[VideoConfig] = None, **kwargs): @property def supported_extensions(self) -> List[str]: """Return supported video file extensions.""" - return [".mp4", ".avi", ".mov", 
".mkv", ".webm", ".flv", ".wmv"] + return VIDEO_EXTENSIONS async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py index 378bcd7d..ac51fec9 100644 --- a/openviking/parse/registry.py +++ b/openviking/parse/registry.py @@ -8,16 +8,19 @@ import logging from pathlib import Path -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from openviking.parse.base import ParseResult from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.code import CodeRepositoryParser +from openviking.parse.parsers.directory import DirectoryParser from openviking.parse.parsers.epub import EPubParser from openviking.parse.parsers.excel import ExcelParser # Import will be handled dynamically to avoid dependency issues from openviking.parse.parsers.html import HTMLParser from openviking.parse.parsers.markdown import MarkdownParser +from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser from openviking.parse.parsers.pdf import PDFParser from openviking.parse.parsers.powerpoint import PowerPointParser from openviking.parse.parsers.text import TextParser @@ -25,6 +28,8 @@ # Import markitdown-inspired parsers from openviking.parse.parsers.word import WordParser from openviking.parse.parsers.zip_parser import ZipParser +from openviking_cli.utils.config import get_openviking_config +from openviking_cli.utils.config.parser_config import load_parser_configs_from_dict if TYPE_CHECKING: from openviking.parse.custom import CustomParserProtocol @@ -34,27 +39,35 @@ class ParserRegistry: """ - Registry for document parsers. + Registry for document parsers, which is a singleton. Automatically selects appropriate parser based on file extension. 
""" - def __init__(self, register_optional: bool = True): + def __init__( + self, register_optional: bool = True, parser_configs: Optional[Dict[str, Any]] = None + ): """ Initialize registry with default parsers. Args: register_optional: Whether to register optional parsers that require extra dependencies + parser_configs: Dictionary of parser configurations (from load_parser_configs_from_dict) """ self._parsers: Dict[str, BaseParser] = {} self._extension_map: Dict[str, str] = {} + # Get parser configs + self._parser_configs = parser_configs or {} + config = get_openviking_config() + self._parser_configs = load_parser_configs_from_dict(config.parsers) + # Register core parsers - self.register("text", TextParser()) - self.register("markdown", MarkdownParser()) - self.register("pdf", PDFParser()) - self.register("html", HTMLParser()) + self.register("text", TextParser(config=self._parser_configs.get("text"))) + self.register("markdown", MarkdownParser(config=self._parser_configs.get("markdown"))) + self.register("pdf", PDFParser(config=self._parser_configs.get("pdf"))) + self.register("html", HTMLParser()) # HTMLParser doesn't accept config yet # Register markitdown-inspired parsers (built-in) self.register("word", WordParser()) @@ -62,38 +75,12 @@ def __init__(self, register_optional: bool = True): self.register("excel", ExcelParser()) self.register("epub", EPubParser()) self.register("zip", ZipParser()) + self.register("code", CodeRepositoryParser()) + self.register("directory", DirectoryParser()) - # Register code parser dynamically - try: - from openviking.parse.parsers.code import CodeRepositoryParser - - self.register("code", CodeRepositoryParser()) - except ImportError as e: - logger.warning(f"CodeRepositoryParser not available: {e}") - - # Register directory parser - try: - from openviking.parse.parsers.directory import DirectoryParser - - self.register("directory", DirectoryParser()) - except ImportError as e: - logger.warning(f"DirectoryParser not 
available: {e}") - - # Register optional media parsers - if register_optional: - try: - from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser - - self.register("image", ImageParser()) - logger.info("Registered ImageParser for image formats") - - self.register("audio", AudioParser()) - logger.info("Registered AudioParser for audio formats") - - self.register("video", VideoParser()) - logger.info("Registered VideoParser for video formats") - except ImportError as e: - logger.debug(f"Media parsers not registered: {e}") + self.register("image", ImageParser(config=self._parser_configs.get("image"))) + self.register("audio", AudioParser(config=self._parser_configs.get("audio"))) + self.register("video", VideoParser(config=self._parser_configs.get("video"))) def register(self, name: str, parser: BaseParser) -> None: """ @@ -290,11 +277,11 @@ def list_supported_extensions(self) -> List[str]: _default_registry: Optional[ParserRegistry] = None -def get_registry() -> ParserRegistry: +def get_registry(parser_configs: Optional[Dict[str, Any]] = None) -> ParserRegistry: """Get the default parser registry.""" global _default_registry if _default_registry is None: - _default_registry = ParserRegistry() + _default_registry = ParserRegistry(parser_configs=parser_configs) return _default_registry diff --git a/openviking/prompts/templates/parsing/image_summary.yaml b/openviking/prompts/templates/parsing/image_summary.yaml new file mode 100644 index 00000000..26bf3f0f --- /dev/null +++ b/openviking/prompts/templates/parsing/image_summary.yaml @@ -0,0 +1,31 @@ +metadata: + id: "parsing.image_summary" + name: "Image Summary" + description: "Generate concise image summary for semantic parsing" + version: "1.0.0" + language: "en" + category: "parsing" + +variables: + - name: "context" + type: "string" + description: "Additional context for image understanding" + default: "No additional context" + required: false + +template: | + Please analyze this image and 
generate a concise summary for semantic indexing. + + Context: {{ context }} + + Generate a comprehensive description that includes: + 1. What is in the image (main subjects, objects) + 2. What is happening or what the image depicts + 3. Any text visible in the image + 4. Key visual elements and their relationships + + Keep the description clear and detailed, suitable for semantic search and understanding. + +llm_config: + temperature: 0.0 + supports_vision: true diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 38da777a..5904d826 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -12,7 +12,9 @@ FILE_TYPE_CODE, FILE_TYPE_DOCUMENTATION, FILE_TYPE_OTHER, + IGNORE_EXTENSIONS, ) +from openviking.parse.parsers.upload_utils import is_text_file from openviking.prompts import render_prompt from openviking.storage.queuefs.named_queue import DequeueHandlerBase from openviking.storage.queuefs.semantic_dag import DagStats, SemanticDagExecutor @@ -289,10 +291,26 @@ async def _generate_single_file_summary( file_name = file_path.split("/")[-1] try: + # Check if this is a binary file that should be skipped + from pathlib import Path + + p = Path(file_name) + extension = p.suffix.lower() + + # Skip binary files (using IGNORE_EXTENSIONS as reference) + if extension in IGNORE_EXTENSIONS or not is_text_file(file_name): + logger.debug(f"Skipping binary file for summary generation: {file_path}") + return {"name": file_name, "summary": ""} + # Read file content (limit length) content = await viking_fs.read_file(file_path) if isinstance(content, bytes): - content = content.decode("utf-8") + # Try to decode with error handling for text files + try: + content = content.decode("utf-8") + except UnicodeDecodeError: + logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}") + return {"name": file_name, "summary": ""} # Limit content length 
(about 10000 tokens) max_chars = 30000 diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py index 14931b6e..079daf53 100644 --- a/openviking_cli/utils/config/parser_config.py +++ b/openviking_cli/utils/config/parser_config.py @@ -229,7 +229,7 @@ class ImageConfig(ParserConfig): Configuration for image parsing. Attributes: - enable_ocr: Whether to perform OCR text extraction + enable_ocr: Whether to perform OCR text extraction, not implemented enable_vlm: Whether to use VLM for visual understanding ocr_lang: Language for OCR (e.g., "chi_sim", "eng") vlm_model: VLM model to use (e.g., "gpt-4-vision") diff --git a/tests/parse/test_add_directory.py b/tests/parse/test_add_directory.py index 0402adf3..81fd650b 100644 --- a/tests/parse/test_add_directory.py +++ b/tests/parse/test_add_directory.py @@ -180,6 +180,18 @@ def tmp_mixed(tmp_path: Path) -> Path: return tmp_path +@pytest.fixture +def tmp_media_files(tmp_path: Path) -> Path: + """Directory with various media files and regular files.""" + (tmp_path / "docs.md").write_text("# Documentation", encoding="utf-8") + (tmp_path / "image.png").write_bytes(b"\x89PNG\r\n\x1a\n") + (tmp_path / "photo.jpg").write_bytes(b"\xff\xd8\xff") + (tmp_path / "audio.mp3").write_bytes(b"ID3") + (tmp_path / "video.mp4").write_bytes(b"\x00\x00\x00\x18ftyp") + (tmp_path / "script.js").write_text("console.log('test')", encoding="utf-8") + return tmp_path + + # --------------------------------------------------------------------------- # Tests: basic properties # --------------------------------------------------------------------------- @@ -461,3 +473,92 @@ async def test_result_fields(self, tmp_code: Path, parser, fake_fs) -> None: assert result.meta["dir_name"] == tmp_code.name assert result.meta["total_processable"] == 3 assert result.meta["file_count"] == 3 + + +# --------------------------------------------------------------------------- +# Tests: directly_upload_media parameter +# 
--------------------------------------------------------------------------- + + +class TestDirectlyUploadMedia: + """Test the directly_upload_media parameter behavior.""" + + @pytest.mark.asyncio + async def test_default_directly_upload_media_true(self, tmp_media_files: Path, fake_fs) -> None: + """Test that with directly_upload_media=True (default), media files are uploaded directly.""" + with patch.object(BaseParser, "_get_viking_fs", return_value=fake_fs): + parser = DirectoryParser() + await parser.parse(str(tmp_media_files)) + + uploaded_names = {uri.split("/")[-1] for uri in fake_fs.files} + + assert "docs.md" in uploaded_names + assert "image.png" in uploaded_names + assert "photo.jpg" in uploaded_names + assert "audio.mp3" in uploaded_names + assert "video.mp4" in uploaded_names + assert "script.js" in uploaded_names + + @pytest.mark.asyncio + async def test_directly_upload_media_false(self, tmp_media_files: Path, fake_fs) -> None: + """Test that with directly_upload_media=False, media files go through their parsers.""" + mock_image_result = create_parse_result( + root=ResourceNode(type=NodeType.ROOT), + source_path=str(tmp_media_files / "image.png"), + source_format="image", + parser_name="ImageParser", + parse_time=0.1, + ) + mock_image_result.temp_dir_path = fake_fs.create_temp_uri() + + mock_audio_result = create_parse_result( + root=ResourceNode(type=NodeType.ROOT), + source_path=str(tmp_media_files / "audio.mp3"), + source_format="audio", + parser_name="AudioParser", + parse_time=0.1, + ) + mock_audio_result.temp_dir_path = fake_fs.create_temp_uri() + + mock_video_result = create_parse_result( + root=ResourceNode(type=NodeType.ROOT), + source_path=str(tmp_media_files / "video.mp4"), + source_format="video", + parser_name="VideoParser", + parse_time=0.1, + ) + mock_video_result.temp_dir_path = fake_fs.create_temp_uri() + + with patch.object(BaseParser, "_get_viking_fs", return_value=fake_fs): + parser = DirectoryParser() + + with patch.object(parser, 
"_assign_parser") as mock_assign: + from openviking.parse.parsers.media.audio import AudioParser + from openviking.parse.parsers.media.image import ImageParser + from openviking.parse.parsers.media.video import VideoParser + + mock_image = AsyncMock(spec=ImageParser) + mock_image.parse = AsyncMock(return_value=mock_image_result) + + mock_audio = AsyncMock(spec=AudioParser) + mock_audio.parse = AsyncMock(return_value=mock_audio_result) + + mock_video = AsyncMock(spec=VideoParser) + mock_video.parse = AsyncMock(return_value=mock_video_result) + + def assign_side_effect(cf, registry): + if cf.path.suffix in {".png", ".jpg"}: + return mock_image + elif cf.path.suffix in {".mp3"}: + return mock_audio + elif cf.path.suffix in {".mp4"}: + return mock_video + return registry.get_parser_for_file(cf.path) + + mock_assign.side_effect = assign_side_effect + + await parser.parse(str(tmp_media_files), directly_upload_media=False) + + assert mock_image.parse.call_count == 2 + mock_audio.parse.assert_called_once() + mock_video.parse.assert_called_once() From 0974f3b780341662699707a8e65a7c19195833ca Mon Sep 17 00:00:00 2001 From: openviking Date: Thu, 19 Feb 2026 17:16:16 +0800 Subject: [PATCH 12/18] feat: vlm optimization for image --- openviking/parse/registry.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py index ac51fec9..5777edec 100644 --- a/openviking/parse/registry.py +++ b/openviking/parse/registry.py @@ -8,7 +8,7 @@ import logging from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union from openviking.parse.base import ParseResult from openviking.parse.parsers.base_parser import BaseParser @@ -28,8 +28,6 @@ # Import markitdown-inspired parsers from openviking.parse.parsers.word import WordParser from openviking.parse.parsers.zip_parser import 
ZipParser -from openviking_cli.utils.config import get_openviking_config -from openviking_cli.utils.config.parser_config import load_parser_configs_from_dict if TYPE_CHECKING: from openviking.parse.custom import CustomParserProtocol @@ -44,9 +42,7 @@ class ParserRegistry: Automatically selects appropriate parser based on file extension. """ - def __init__( - self, register_optional: bool = True, parser_configs: Optional[Dict[str, Any]] = None - ): + def __init__(self, register_optional: bool = True): """ Initialize registry with default parsers. @@ -58,15 +54,10 @@ def __init__( self._parsers: Dict[str, BaseParser] = {} self._extension_map: Dict[str, str] = {} - # Get parser configs - self._parser_configs = parser_configs or {} - config = get_openviking_config() - self._parser_configs = load_parser_configs_from_dict(config.parsers) - # Register core parsers - self.register("text", TextParser(config=self._parser_configs.get("text"))) - self.register("markdown", MarkdownParser(config=self._parser_configs.get("markdown"))) - self.register("pdf", PDFParser(config=self._parser_configs.get("pdf"))) + self.register("text", TextParser()) + self.register("markdown", MarkdownParser()) + self.register("pdf", PDFParser()) self.register("html", HTMLParser()) # HTMLParser doesn't accept config yet # Register markitdown-inspired parsers (built-in) @@ -78,9 +69,9 @@ def __init__( self.register("code", CodeRepositoryParser()) self.register("directory", DirectoryParser()) - self.register("image", ImageParser(config=self._parser_configs.get("image"))) - self.register("audio", AudioParser(config=self._parser_configs.get("audio"))) - self.register("video", VideoParser(config=self._parser_configs.get("video"))) + self.register("image", ImageParser()) + self.register("audio", AudioParser()) + self.register("video", VideoParser()) def register(self, name: str, parser: BaseParser) -> None: """ @@ -277,11 +268,11 @@ def list_supported_extensions(self) -> List[str]: _default_registry: 
Optional[ParserRegistry] = None -def get_registry(parser_configs: Optional[Dict[str, Any]] = None) -> ParserRegistry: +def get_registry() -> ParserRegistry: """Get the default parser registry.""" global _default_registry if _default_registry is None: - _default_registry = ParserRegistry(parser_configs=parser_configs) + _default_registry = ParserRegistry() return _default_registry From 25d52b98fe02932059d021614c9751b49d82d276 Mon Sep 17 00:00:00 2001 From: openviking Date: Fri, 20 Feb 2026 12:14:45 +0800 Subject: [PATCH 13/18] refactor: move media content understanding to SemanticProcessor - Add parse/parsers/media/utils.py with media helpers - Refactor ImageParser.parse(), AudioParser.parse(), VideoParser.parse() to remove content understanding, keep only metadata extraction - Update SemanticProcessor._generate_single_file_summary() to handle media types and call media utils for summary generation - Update TreeBuilder._get_base_uri() to use media utils - Update ResourceNode.get_abstract() and get_overview() to check meta for abstract/overview - Add debug logs and error handling --- openviking/parse/base.py | 8 +- openviking/parse/parsers/media/__init__.py | 3 +- openviking/parse/parsers/media/audio.py | 38 +---- openviking/parse/parsers/media/image.py | 59 ++----- openviking/parse/parsers/media/utils.py | 145 ++++++++++++++++++ openviking/parse/parsers/media/video.py | 35 +---- openviking/parse/tree_builder.py | 77 +++------- .../storage/queuefs/semantic_processor.py | 38 ++++- openviking/utils/resource_processor.py | 1 + 9 files changed, 232 insertions(+), 172 deletions(-) create mode 100644 openviking/parse/parsers/media/utils.py diff --git a/openviking/parse/base.py b/openviking/parse/base.py index 9548f4b4..91eec900 100644 --- a/openviking/parse/base.py +++ b/openviking/parse/base.py @@ -254,7 +254,7 @@ def get_text(self, include_children: bool = True) -> str: texts.append(child.get_text(include_children=True)) return "\n".join(texts) - def get_abstract(self, 
max_length: int = 200) -> str: + def get_abstract(self, max_length: int = 256) -> str: """ Generate L0 abstract for this node. @@ -264,6 +264,8 @@ def get_abstract(self, max_length: int = 200) -> str: Returns: Abstract text """ + if "abstract" in self.meta: + return self.meta["abstract"] if self.title: abstract = self.title else: @@ -285,8 +287,10 @@ def get_overview(self, max_length: int = 4000) -> str: Returns: Overview text including structure summary """ + if "overview" in self.meta: + return self.meta["overview"] + # Default overview generation parts = [] - if self.title: parts.append(f"**{self.title}**") diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py index 7fed46b5..9f58977f 100644 --- a/openviking/parse/parsers/media/__init__.py +++ b/openviking/parse/parsers/media/__init__.py @@ -3,6 +3,7 @@ from .audio import AudioParser from .image import ImageParser +from .utils import get_media_base_uri, get_media_type from .video import VideoParser -__all__ = ["ImageParser", "AudioParser", "VideoParser"] +__all__ = ["ImageParser", "AudioParser", "VideoParser", "get_media_type", "get_media_base_uri"] diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py index e9473658..3caeb44a 100644 --- a/openviking/parse/parsers/media/audio.py +++ b/openviking/parse/parsers/media/audio.py @@ -55,19 +55,7 @@ def supported_extensions(self) -> List[str]: async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ - Parse audio file using three-phase architecture. 
- - Phase 1: Generate temporary files - - Copy original audio to temp_uri/content.{ext} - - (Optional) Generate transcript with timestamps - - Phase 2: Generate semantic info - - Generate abstract and overview based on description - - Overview includes file list and usage instructions - - Phase 3: Build directory structure - - Move all files to final URI - - Generate .abstract.md, .overview.md + Parse audio file - only copy original file and extract basic metadata, no content understanding. Args: source: Audio file path @@ -140,24 +128,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) channels = 0 format_str = ext[1:].upper() - # 1.3 Generate ASR description - description = "" - if self.config.enable_transcription: - description = await self._asr_transcribe(audio_bytes, self.config.asr_model) - else: - # Fallback: basic description - description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)" - - # 1.4 Transcript with timestamps (optional) - transcript_text = None - if self.config.enable_transcription and self.config.enable_timestamps: - transcript_text = await self._asr_transcribe_with_timestamps( - audio_bytes, self.config.asr_model - ) - if transcript_text: - await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text) - - # Create ResourceNode + # Create ResourceNode - metadata only, no content understanding yet root_node = ResourceNode( type=NodeType.ROOT, title=file_path.stem, @@ -177,11 +148,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) }, ) - # Phase 2: Generate semantic info - await self._generate_semantic_info( - root_node, description, viking_fs, transcript_text is not None - ) - # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( root=root_node, diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py index 059735d1..3442f26e 100644 --- 
a/openviking/parse/parsers/media/image.py +++ b/openviking/parse/parsers/media/image.py @@ -18,12 +18,15 @@ from openviking.parse.base import NodeType, ParseResult, ResourceNode from openviking.parse.parsers.base_parser import BaseParser from openviking.parse.parsers.media.constants import IMAGE_EXTENSIONS +from openviking.prompts import render_prompt from openviking.storage.viking_fs import get_viking_fs from openviking_cli.utils.config import get_openviking_config from openviking_cli.utils.config.parser_config import ImageConfig from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI +logger = get_logger(__name__) + # ============================================================================= # Configuration Classes # ============================================================================= @@ -71,19 +74,7 @@ def supported_extensions(self) -> List[str]: async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ - Parse image file using three-phase architecture. - - Phase 1: Generate temporary files - - Copy original image to temp_uri/content.{ext} - - (Optional) Generate ocr.md using OCR - - Phase 2: Generate semantic info - - Generate abstract and overview based on description - - Overview includes file list and usage instructions - - Phase 3: Build directory structure - - Move all files to final URI - - Generate .abstract.md, .overview.md + Parse image file - only copy original file and extract basic metadata, no content understanding. 
Args: source: Image file path @@ -96,7 +87,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) FileNotFoundError: If source file does not exist IOError: If image processing fails """ - # Convert to Path object file_path = Path(source) if isinstance(source, str) else source if not file_path.exists(): @@ -132,22 +122,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) except Exception as e: raise ValueError(f"Invalid image file: {file_path}. Error: {e}") from e - # 1.3 Generate VLM description - description = "" - if self.config.enable_vlm: - description = await self._vlm_describe(image_bytes, self.config.vlm_model) - else: - # Fallback: basic description - description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" - - # 1.4 OCR (optional) - ocr_text = None - if self.config.enable_ocr: - ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) - if ocr_text: - await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text) - - # Create ResourceNode + # Create ResourceNode - metadata only, no content understanding yet root_node = ResourceNode( type=NodeType.ROOT, title=file_path.stem, @@ -166,11 +141,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) }, ) - # Phase 2: Generate semantic info - await self._generate_semantic_info( - root_node, description, viking_fs, ocr_text is not None, root_dir_uri - ) - # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( root=root_node, @@ -192,10 +162,6 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: Returns: Image description in markdown format """ - from openviking.prompts import render_prompt - - logger = get_logger(__name__) - try: vlm = get_openviking_config().vlm @@ -206,17 +172,20 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: "context": "No additional context", }, ) - - # Call VLM response = 
await vlm.get_vision_completion_async( prompt=prompt, images=[image_bytes], ) + logger.info( + f"[ImageParser._vlm_describe] VLM response received, length: {len(response)}, content: {response[:256]}" + ) return response.strip() except Exception as e: - logger.error(f"Error in VLM image description: {e}") + logger.error( + f"[ImageParser._vlm_describe] Error in VLM image description: {e}", exc_info=True + ) # Fallback to basic description return "Image description (VLM integration failed)\n\nThis is an image file." @@ -250,7 +219,7 @@ async def _generate_semantic_info( root_dir_uri: Root directory URI to write semantic files """ # Generate abstract (short summary, < 100 tokens) - abstract = description[:200] if len(description) > 200 else description + abstract = description[:253] + "..." if len(description) > 256 else description # Generate overview (content summary + file list + usage instructions) overview_parts = [ @@ -294,8 +263,8 @@ async def _generate_semantic_info( node.meta["overview"] = overview # Write to files in temp directory - await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract) - await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview) + # await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract) + # await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview) async def parse_content( self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py new file mode 100644 index 00000000..f4acbdf4 --- /dev/null +++ b/openviking/parse/parsers/media/utils.py @@ -0,0 +1,145 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Media-related utilities for OpenViking.""" + +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Optional + +from openviking.prompts import render_prompt +from openviking.storage.viking_fs import get_viking_fs +from openviking_cli.utils.config import get_openviking_config +from openviking_cli.utils.logger import get_logger + +from .constants import AUDIO_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS + +logger = get_logger(__name__) + + +def get_media_type(source_path: Optional[str], source_format: Optional[str]) -> Optional[str]: + """ + Determine media type from source path or format. + + Args: + source_path: Source file path + source_format: Source format string (e.g., "image", "audio", "video") + + Returns: + Media type ("image", "audio", "video") or None if not a media file + """ + if source_format: + if source_format in ["image", "audio", "video"]: + return source_format + + if source_path: + ext = Path(source_path).suffix.lower() + if ext in IMAGE_EXTENSIONS: + return "image" + elif ext in AUDIO_EXTENSIONS: + return "audio" + elif ext in VIDEO_EXTENSIONS: + return "video" + + return None + + +def get_media_base_uri(media_type: str) -> str: + """ + Get base URI for media files. + + Args: + media_type: Media type ("image", "audio", "video") + + Returns: + Base URI like "viking://resources/images/20250219" + """ + # Map singular media types to plural directory names + media_dir_map = {"image": "images", "audio": "audio", "video": "video"} + media_dir = media_dir_map.get(media_type, media_type) + # Get current date in YYYYMMDD format + date_str = datetime.now().strftime("%Y%m%d") + return f"viking://resources/{media_dir}/{date_str}" + + +async def generate_image_summary(image_uri: str, original_filename: str) -> Dict[str, Any]: + """ + Generate summary for an image file using VLM. 
+ + Args: + image_uri: URI to the image file in VikingFS + original_filename: Original filename of the image + + Returns: + Dictionary with "name" and "summary" keys + """ + viking_fs = get_viking_fs() + vlm = get_openviking_config().vlm + file_name = original_filename + + try: + # Read image bytes + image_bytes = await viking_fs.read_file(image_uri) + if not isinstance(image_bytes, bytes): + raise ValueError(f"Expected bytes for image file, got {type(image_bytes)}") + + logger.info( + f"[MediaUtils.generate_image_summary] Generating summary for image: {image_uri}" + ) + + # Render prompt + prompt = render_prompt( + "parsing.image_summary", + {"context": "No additional context"}, + ) + + # Call VLM + response = await vlm.get_vision_completion_async( + prompt=prompt, + images=[image_bytes], + ) + + logger.info( + f"[MediaUtils.generate_image_summary] VLM response received, length: {len(response)}" + ) + return {"name": file_name, "summary": response.strip()} + + except Exception as e: + logger.error( + f"[MediaUtils.generate_image_summary] Failed to generate image summary: {e}", + exc_info=True, + ) + return {"name": file_name, "summary": "Image summary generation failed"} + + +async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict[str, Any]: + """ + Generate summary for an audio file (placeholder). + + Args: + audio_uri: URI to the audio file in VikingFS + original_filename: Original filename of the audio + + Returns: + Dictionary with "name" and "summary" keys + """ + logger.info( + f"[MediaUtils.generate_audio_summary] Audio summary generation not yet implemented for: {audio_uri}" + ) + return {"name": original_filename, "summary": "Audio summary generation not yet implemented"} + + +async def generate_video_summary(video_uri: str, original_filename: str) -> Dict[str, Any]: + """ + Generate summary for a video file (placeholder). 
+ + Args: + video_uri: URI to the video file in VikingFS + original_filename: Original filename of the video + + Returns: + Dictionary with "name" and "summary" keys + """ + logger.info( + f"[MediaUtils.generate_video_summary] Video summary generation not yet implemented for: {video_uri}" + ) + return {"name": original_filename, "summary": "Video summary generation not yet implemented"} diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py index 84ee468f..a0a07602 100644 --- a/openviking/parse/parsers/media/video.py +++ b/openviking/parse/parsers/media/video.py @@ -55,20 +55,7 @@ def supported_extensions(self) -> List[str]: async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ - Parse video file using three-phase architecture. - - Phase 1: Generate temporary files - - Copy original video to temp_uri/content.{ext} - - Extract key frames - - Extract audio track and transcribe using ASR - - Phase 2: Generate semantic info - - Generate abstract and overview based on descriptions - - Overview includes file list and usage instructions - - Phase 3: Build directory structure - - Move all files to final URI - - Generate .abstract.md, .overview.md + Parse video file - only copy original file and extract basic metadata, no content understanding. 
Args: source: Video file path @@ -142,22 +129,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) fps = 0 format_str = ext[1:].upper() - # 1.3 Generate combined description - description = "" - if self.config.enable_key_frames or self.config.enable_audio_transcription: - description = await self._generate_video_description(file_path, self.config) - else: - # Fallback: basic description - description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)" - - # 1.4 Key frames (optional) - key_frames_dir = f"{root_dir_uri}/keyframes" - has_key_frames = False - if self.config.enable_key_frames: - await viking_fs.mkdir(key_frames_dir) - has_key_frames = True - - # Create ResourceNode + # Create ResourceNode - metadata only, no content understanding yet root_node = ResourceNode( type=NodeType.ROOT, title=file_path.stem, @@ -178,9 +150,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) }, ) - # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames) - # Phase 3: Build directory structure (handled by TreeBuilder) return ParseResult( root=root_node, diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 0e582dab..3310794b 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -24,6 +24,8 @@ from typing import TYPE_CHECKING, Optional from openviking.core.building_tree import BuildingTree +from openviking.parse.parsers.media.utils import get_media_base_uri, get_media_type +from openviking.storage.queuefs import SemanticMsg, get_queue_manager from openviking.storage.viking_fs import get_viking_fs from openviking_cli.utils.uri import VikingURI @@ -58,10 +60,15 @@ def __init__(self): """Initialize TreeBuilder.""" pass - def _get_base_uri(self, scope: str) -> str: - """Get base URI for scope.""" - # Resources are now in independent resources scope + 
def _get_base_uri( + self, scope: str, source_path: Optional[str] = None, source_format: Optional[str] = None + ) -> str: + """Get base URI for scope, with special handling for media files.""" + # Check if it's a media file first if scope == "resources": + media_type = get_media_type(source_path, source_format) + if media_type: + return get_media_base_uri(media_type) return "viking://resources" if scope == "user": # user resources go to memories (no separate resources dir) @@ -93,6 +100,7 @@ async def finalize_from_temp( temp_dir_path: Temporary directory Viking URI (e.g., viking://temp/xxx) scope: Scope ("resources", "user", or "agent") base_uri: Base URI (None = use scope default) + source_node: Source ResourceNode source_path: Source file path source_format: Source file format @@ -115,70 +123,38 @@ async def finalize_from_temp( f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}" ) - from openviking_cli.utils.uri import VikingURI - doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"]) - doc_uri = f"{temp_uri}/{doc_name}" + temp_doc_uri = f"{temp_uri}/{doc_name}" # 2. 
Determine base_uri if base_uri is None: - # Check if it's a media file (image/audio/video) - media_type = None - if source_format: - if source_format in ["image", "audio", "video"]: - media_type = source_format - elif source_path: - from pathlib import Path - - ext = Path(source_path).suffix.lower() - image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] - audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] - video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv"] - if ext in image_exts: - media_type = "image" - elif ext in audio_exts: - media_type = "audio" - elif ext in video_exts: - media_type = "video" - - if media_type: - # Map singular media types to plural directory names - media_dir_map = {"image": "images", "audio": "audio", "video": "video"} - media_dir = media_dir_map.get(media_type, media_type) - # Get current date in YYYYMMDD format - from datetime import datetime - - date_str = datetime.now().strftime("%Y%m%d") - base_uri = f"viking://resources/{media_dir}/{date_str}" - else: - base_uri = self._get_base_uri(scope) - - logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}") + base_uri = self._get_base_uri(scope, source_path, source_format) # 3. Build final URI, auto-renaming on conflict (e.g. doc_1, doc_2, ...) candidate_uri = VikingURI(base_uri).join(doc_name).uri final_uri = await self._resolve_unique_uri(candidate_uri) if final_uri != candidate_uri: - logger.info(f"Resolved name conflict: {candidate_uri} -> {final_uri}") + logger.info(f"[TreeBuilder] Resolved name conflict: {candidate_uri} -> {final_uri}") + else: + logger.info(f"[TreeBuilder] Finalizing from temp: {final_uri}") # 4. Move directory tree from temp to final location in AGFS - await self._move_directory_in_agfs(doc_uri, final_uri) - logger.info(f"Moved temp tree: {doc_uri} -> {final_uri}") + await self._move_directory_in_agfs(temp_doc_uri, final_uri) + logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") # 5. 
Cleanup temporary root directory try: await viking_fs.delete_temp(temp_uri) - logger.info(f"Cleaned up temp root: {temp_uri}") + logger.info(f"[TreeBuilder] Cleaned up temp root: {temp_uri}") except Exception as e: - logger.warning(f"Failed to cleanup temp root: {e}") + logger.warning(f"[TreeBuilder] Failed to cleanup temp root: {e}") # 6. Enqueue to SemanticQueue for async semantic generation try: - context_type = "resource" # Default to resource - await self._enqueue_semantic_generation(final_uri, context_type) - logger.info(f"Enqueued semantic generation for: {final_uri}") + await self._enqueue_semantic_generation(final_uri, scope) + logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") except Exception as e: - logger.error(f"Failed to enqueue semantic generation: {e}", exc_info=True) + logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True) # 7. Return simple BuildingTree (no scanning needed) tree = BuildingTree( @@ -187,8 +163,6 @@ async def finalize_from_temp( ) tree._root_uri = final_uri - logger.info(f"Finalized tree: root_uri={final_uri}") - return tree async def _resolve_unique_uri(self, uri: str, max_attempts: int = 100) -> str: @@ -215,9 +189,7 @@ async def _exists(u: str) -> bool: if not await _exists(candidate): return candidate - raise FileExistsError( - f"Cannot resolve unique name for {uri} after {max_attempts} attempts" - ) + raise FileExistsError(f"Cannot resolve unique name for {uri} after {max_attempts} attempts") async def _move_directory_in_agfs(self, src_uri: str, dst_uri: str) -> None: """Recursively move AGFS directory tree (copy + delete).""" @@ -280,7 +252,6 @@ async def _enqueue_semantic_generation(self, uri: str, context_type: str) -> Non uri: Directory URI to enqueue context_type: resource/memory/skill """ - from openviking.storage.queuefs import SemanticMsg, get_queue_manager queue_manager = get_queue_manager() diff --git a/openviking/storage/queuefs/semantic_processor.py 
b/openviking/storage/queuefs/semantic_processor.py index 5904d826..09a3ec77 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -291,12 +291,46 @@ async def _generate_single_file_summary( file_name = file_path.split("/")[-1] try: - # Check if this is a binary file that should be skipped + # Check if this is a media file first from pathlib import Path + from openviking.parse.parsers.media.utils import ( + generate_audio_summary, + generate_image_summary, + generate_video_summary, + get_media_type, + ) + p = Path(file_name) extension = p.suffix.lower() + # Check media type + media_type = get_media_type(file_name, None) + if media_type: + logger.info( + f"[SemanticProcessor] Generating media summary for: {file_path}, type: {media_type}" + ) + # Find the original filename by listing the directory (since file_path is like viking://resources/images/xxx/xxx.png) + parent_uri = "/".join(file_path.split("/")[:-1]) + try: + entries = await viking_fs.ls(parent_uri) + original_filename = file_name # default to file_name + for entry in entries: + name = entry.get("name", "") + if name and not name.startswith(".") and not entry.get("isDir"): + original_filename = name + break + except Exception: + original_filename = file_name + + if media_type == "image": + return await generate_image_summary(file_path, original_filename) + elif media_type == "audio": + return await generate_audio_summary(file_path, original_filename) + elif media_type == "video": + return await generate_video_summary(file_path, original_filename) + + # Check if this is a binary file that should be skipped # Skip binary files (using IGNORE_EXTENSIONS as reference) if extension in IGNORE_EXTENSIONS or not is_text_file(file_name): logger.debug(f"Skipping binary file for summary generation: {file_path}") @@ -345,7 +379,7 @@ async def _generate_single_file_summary( return {"name": file_name, "summary": summary.strip()} except Exception as e: - 
logger.warning(f"Failed to generate summary for {file_path}: {e}") + logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True) return {"name": file_name, "summary": ""} def _extract_abstract_from_overview(self, overview_content: str) -> str: diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 156c48dd..ff240d98 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -119,6 +119,7 @@ async def process_resource( except Exception as e: result["status"] = "error" result["errors"].append(f"Parse error: {e}") + logger.error(f"[ResourceProcessor] Parse error: {e}") return result # parse_result contains: From 4590c0286a9dcd5a1a35cbdeecb002c42f99e791 Mon Sep 17 00:00:00 2001 From: openviking Date: Fri, 20 Feb 2026 13:04:11 +0800 Subject: [PATCH 14/18] refactor: split _generate_single_file_summary to add _generate_text_summary - Add _generate_text_summary function for text file processing - Update media utils functions to accept llm_sem and use it to limit concurrent calls - Update _generate_single_file_summary to call _generate_text_summary and media utils functions - Fix import ordering - Fix issue where _generate_file_summaries was creating a new semaphore, now each _generate_single_file_summary handles its own --- openviking/parse/parsers/media/utils.py | 22 ++-- .../storage/queuefs/semantic_processor.py | 119 +++++++++--------- 2 files changed, 76 insertions(+), 65 deletions(-) diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py index f4acbdf4..d89522e4 100644 --- a/openviking/parse/parsers/media/utils.py +++ b/openviking/parse/parsers/media/utils.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 """Media-related utilities for OpenViking.""" +import asyncio from datetime import datetime from pathlib import Path from typing import Any, Dict, Optional @@ -61,7 +62,9 @@ def get_media_base_uri(media_type: str) 
-> str: return f"viking://resources/{media_dir}/{date_str}" -async def generate_image_summary(image_uri: str, original_filename: str) -> Dict[str, Any]: +async def generate_image_summary( + image_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None +) -> Dict[str, Any]: """ Generate summary for an image file using VLM. @@ -93,10 +96,11 @@ async def generate_image_summary(image_uri: str, original_filename: str) -> Dict ) # Call VLM - response = await vlm.get_vision_completion_async( - prompt=prompt, - images=[image_bytes], - ) + async with llm_sem or asyncio.Semaphore(1): + response = await vlm.get_vision_completion_async( + prompt=prompt, + images=[image_bytes], + ) logger.info( f"[MediaUtils.generate_image_summary] VLM response received, length: {len(response)}" @@ -111,7 +115,9 @@ async def generate_image_summary(image_uri: str, original_filename: str) -> Dict return {"name": file_name, "summary": "Image summary generation failed"} -async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict[str, Any]: +async def generate_audio_summary( + audio_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None +) -> Dict[str, Any]: """ Generate summary for an audio file (placeholder). @@ -128,7 +134,9 @@ async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict return {"name": original_filename, "summary": "Audio summary generation not yet implemented"} -async def generate_video_summary(video_uri: str, original_filename: str) -> Dict[str, Any]: +async def generate_video_summary( + video_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None +) -> Dict[str, Any]: """ Generate summary for a video file (placeholder). 
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 09a3ec77..53e77738 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -3,6 +3,7 @@ """SemanticProcessor: Processes messages from SemanticQueue, generates .abstract.md and .overview.md.""" import asyncio +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from openviking.core.context import Context, ResourceContentType, Vectorize @@ -14,6 +15,12 @@ FILE_TYPE_OTHER, IGNORE_EXTENSIONS, ) +from openviking.parse.parsers.media.utils import ( + generate_audio_summary, + generate_image_summary, + generate_video_summary, + get_media_type, +) from openviking.parse.parsers.upload_utils import is_text_file from openviking.prompts import render_prompt from openviking.storage.queuefs.named_queue import DequeueHandlerBase @@ -252,11 +259,8 @@ async def _generate_file_summaries( if not file_paths: return [] - sem = asyncio.Semaphore(self.max_concurrent_llm) - async def generate_one_summary(file_path: str) -> Dict[str, str]: - async with sem: - summary = await self._generate_single_file_summary(file_path) + summary = await self._generate_single_file_summary(file_path) if enqueue_files and context_type and parent_uri: try: await self._vectorize_single_file( @@ -275,6 +279,52 @@ async def generate_one_summary(file_path: str) -> Dict[str, str]: tasks = [generate_one_summary(fp) for fp in file_paths] return await asyncio.gather(*tasks) + async def _generate_text_summary( + self, file_path: str, file_name: str, llm_sem: asyncio.Semaphore + ) -> Dict[str, str]: + """Generate summary for a single text file (code, documentation, or other text).""" + viking_fs = get_viking_fs() + vlm = get_openviking_config().vlm + + # Read file content (limit length) + content = await viking_fs.read_file(file_path) + if isinstance(content, bytes): + # Try to decode with error handling for text 
files + try: + content = content.decode("utf-8") + except UnicodeDecodeError: + logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}") + return {"name": file_name, "summary": ""} + + # Limit content length (about 10000 tokens) + max_chars = 30000 + if len(content) > max_chars: + content = content[:max_chars] + "\n...(truncated)" + + # Generate summary + if not vlm.is_available(): + logger.warning("VLM not available, using empty summary") + return {"name": file_name, "summary": ""} + + # Detect file type and select appropriate prompt + file_type = self._detect_file_type(file_name) + + if file_type == FILE_TYPE_CODE: + prompt_id = "semantic.code_summary" + elif file_type == FILE_TYPE_DOCUMENTATION: + prompt_id = "semantic.document_summary" + else: + prompt_id = "semantic.file_summary" + + prompt = render_prompt( + prompt_id, + {"file_name": file_name, "content": content}, + ) + + async with llm_sem: + summary = await vlm.get_completion_async(prompt) + return {"name": file_name, "summary": summary.strip()} + async def _generate_single_file_summary( self, file_path: str, llm_sem: Optional[asyncio.Semaphore] = None ) -> Dict[str, str]: @@ -287,20 +337,12 @@ async def _generate_single_file_summary( {"name": file_name, "summary": summary_content} """ viking_fs = get_viking_fs() - vlm = get_openviking_config().vlm file_name = file_path.split("/")[-1] + llm_sem = llm_sem or asyncio.Semaphore(self.max_concurrent_llm) + try: # Check if this is a media file first - from pathlib import Path - - from openviking.parse.parsers.media.utils import ( - generate_audio_summary, - generate_image_summary, - generate_video_summary, - get_media_type, - ) - p = Path(file_name) extension = p.suffix.lower() @@ -324,11 +366,11 @@ async def _generate_single_file_summary( original_filename = file_name if media_type == "image": - return await generate_image_summary(file_path, original_filename) + return await generate_image_summary(file_path, original_filename, llm_sem) elif 
media_type == "audio": - return await generate_audio_summary(file_path, original_filename) + return await generate_audio_summary(file_path, original_filename, llm_sem) elif media_type == "video": - return await generate_video_summary(file_path, original_filename) + return await generate_video_summary(file_path, original_filename, llm_sem) # Check if this is a binary file that should be skipped # Skip binary files (using IGNORE_EXTENSIONS as reference) @@ -336,47 +378,8 @@ async def _generate_single_file_summary( logger.debug(f"Skipping binary file for summary generation: {file_path}") return {"name": file_name, "summary": ""} - # Read file content (limit length) - content = await viking_fs.read_file(file_path) - if isinstance(content, bytes): - # Try to decode with error handling for text files - try: - content = content.decode("utf-8") - except UnicodeDecodeError: - logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}") - return {"name": file_name, "summary": ""} - - # Limit content length (about 10000 tokens) - max_chars = 30000 - if len(content) > max_chars: - content = content[:max_chars] + "\n...(truncated)" - - # Generate summary - if not vlm.is_available(): - logger.warning("VLM not available, using empty summary") - return {"name": file_name, "summary": ""} - - # Detect file type and select appropriate prompt - file_type = self._detect_file_type(file_name) - - if file_type == FILE_TYPE_CODE: - prompt_id = "semantic.code_summary" - elif file_type == FILE_TYPE_DOCUMENTATION: - prompt_id = "semantic.document_summary" - else: - prompt_id = "semantic.file_summary" - - prompt = render_prompt( - prompt_id, - {"file_name": file_name, "content": content}, - ) - - if llm_sem: - async with llm_sem: - summary = await vlm.get_completion_async(prompt) - else: - summary = await vlm.get_completion_async(prompt) - return {"name": file_name, "summary": summary.strip()} + # Process text file + return await self._generate_text_summary(file_path, file_name, 
llm_sem) except Exception as e: logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True) From dd3e64e740f190638c47c28dee69adf57e288990 Mon Sep 17 00:00:00 2001 From: openviking Date: Fri, 20 Feb 2026 13:36:57 +0800 Subject: [PATCH 15/18] feat: vlm optimization for image --- openviking/parse/parsers/media/utils.py | 2 +- openviking/parse/tree_builder.py | 2 +- .../storage/queuefs/semantic_processor.py | 56 +++---------------- 3 files changed, 10 insertions(+), 50 deletions(-) diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py index d89522e4..1e8ad30d 100644 --- a/openviking/parse/parsers/media/utils.py +++ b/openviking/parse/parsers/media/utils.py @@ -81,7 +81,7 @@ async def generate_image_summary( try: # Read image bytes - image_bytes = await viking_fs.read_file(image_uri) + image_bytes = await viking_fs.read_file_bytes(image_uri) if not isinstance(image_bytes, bytes): raise ValueError(f"Expected bytes for image file, got {type(image_bytes)}") diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 3310794b..7d3f5a89 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -151,7 +151,7 @@ async def finalize_from_temp( # 6. 
Enqueue to SemanticQueue for async semantic generation try: - await self._enqueue_semantic_generation(final_uri, scope) + await self._enqueue_semantic_generation(final_uri, "resource") logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") except Exception as e: logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True) diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 53e77738..cfad652d 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -3,7 +3,6 @@ """SemanticProcessor: Processes messages from SemanticQueue, generates .abstract.md and .overview.md.""" import asyncio -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from openviking.core.context import Context, ResourceContentType, Vectorize @@ -13,7 +12,6 @@ FILE_TYPE_CODE, FILE_TYPE_DOCUMENTATION, FILE_TYPE_OTHER, - IGNORE_EXTENSIONS, ) from openviking.parse.parsers.media.utils import ( generate_audio_summary, @@ -21,7 +19,6 @@ generate_video_summary, get_media_type, ) -from openviking.parse.parsers.upload_utils import is_text_file from openviking.prompts import render_prompt from openviking.storage.queuefs.named_queue import DequeueHandlerBase from openviking.storage.queuefs.semantic_dag import DagStats, SemanticDagExecutor @@ -336,55 +333,18 @@ async def _generate_single_file_summary( Returns: {"name": file_name, "summary": summary_content} """ - viking_fs = get_viking_fs() file_name = file_path.split("/")[-1] - llm_sem = llm_sem or asyncio.Semaphore(self.max_concurrent_llm) - - try: - # Check if this is a media file first - p = Path(file_name) - extension = p.suffix.lower() - - # Check media type - media_type = get_media_type(file_name, None) - if media_type: - logger.info( - f"[SemanticProcessor] Generating media summary for: {file_path}, type: {media_type}" - ) - # Find the original filename by 
listing the directory (since file_path is like viking://resources/images/xxx/xxx.png) - parent_uri = "/".join(file_path.split("/")[:-1]) - try: - entries = await viking_fs.ls(parent_uri) - original_filename = file_name # default to file_name - for entry in entries: - name = entry.get("name", "") - if name and not name.startswith(".") and not entry.get("isDir"): - original_filename = name - break - except Exception: - original_filename = file_name - - if media_type == "image": - return await generate_image_summary(file_path, original_filename, llm_sem) - elif media_type == "audio": - return await generate_audio_summary(file_path, original_filename, llm_sem) - elif media_type == "video": - return await generate_video_summary(file_path, original_filename, llm_sem) - - # Check if this is a binary file that should be skipped - # Skip binary files (using IGNORE_EXTENSIONS as reference) - if extension in IGNORE_EXTENSIONS or not is_text_file(file_name): - logger.debug(f"Skipping binary file for summary generation: {file_path}") - return {"name": file_name, "summary": ""} - - # Process text file + media_type = get_media_type(file_name, None) + if media_type == "image": + return await generate_image_summary(file_path, file_name, llm_sem) + elif media_type == "audio": + return await generate_audio_summary(file_path, file_name, llm_sem) + elif media_type == "video": + return await generate_video_summary(file_path, file_name, llm_sem) + else: return await self._generate_text_summary(file_path, file_name, llm_sem) - except Exception as e: - logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True) - return {"name": file_name, "summary": ""} - def _extract_abstract_from_overview(self, overview_content: str) -> str: """Extract abstract from overview.md.""" lines = overview_content.split("\n") From d167ff14fe4e4d7f97baa959f151621b86639694 Mon Sep 17 00:00:00 2001 From: openviking Date: Fri, 20 Feb 2026 13:39:40 +0800 Subject: [PATCH 16/18] feat: vlm 
optimization for image --- tests/parse/test_directory_parser_routing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/parse/test_directory_parser_routing.py b/tests/parse/test_directory_parser_routing.py index 88ae61bb..349fd2f9 100644 --- a/tests/parse/test_directory_parser_routing.py +++ b/tests/parse/test_directory_parser_routing.py @@ -174,8 +174,7 @@ def test_scan_classifies_all_files_correctly( for ext in self.TEXT_FALLBACK_EXTENSIONS: assert ext in processable_exts, f"{ext} should be processable (text-fallback)" - # .bmp and .rar are unsupported - assert ".bmp" in unsupported_exts + # .rar are unsupported assert ".rar" in unsupported_exts def test_each_processable_file_has_a_parser_or_is_text( From 9c561a4f78deadc04695d796b9b8031523824832 Mon Sep 17 00:00:00 2001 From: openviking Date: Sat, 21 Feb 2026 11:00:52 +0800 Subject: [PATCH 17/18] Implement smart dual-mode for add-resource and import-ovpack, and config system improvements --- examples/ov.conf.example | 3 +- examples/server_client/ov.conf.example | 5 +- openviking/server/routers/pack.py | 12 ++- openviking/server/routers/resources.py | 54 +++++++++++- openviking/utils/media_processor.py | 11 +++ openviking_cli/client/http.py | 88 ++++++++++++++++--- openviking_cli/utils/config/agfs_config.py | 8 +- .../utils/config/open_viking_config.py | 7 +- openviking_cli/utils/config/storage_config.py | 65 +++++++++++++- .../utils/config/vectordb_config.py | 8 +- 10 files changed, 225 insertions(+), 36 deletions(-) diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 34cbc6a4..06822bad 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -6,10 +6,10 @@ "cors_origins": ["*"] }, "storage": { + "workspace": "./data", "vectordb": { "name": "context", "backend": "local", - "path": "./data", "volcengine": { "region": "cn-beijing", "ak": null, @@ -19,7 +19,6 @@ "agfs": { "port": 1833, "log_level": "warn", - "path": "./data", "backend": "local", "timeout": 
10, "retry_times": 3, diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example index 13eb55db..57fe2ef6 100644 --- a/examples/server_client/ov.conf.example +++ b/examples/server_client/ov.conf.example @@ -6,15 +6,14 @@ "cors_origins": ["*"] }, "storage": { + "workspace": "./data", "vectordb": { "name": "context", - "backend": "local", - "path": "./data" + "backend": "local" }, "agfs": { "port": 1833, "log_level": "warn", - "path": "./data", "backend": "local" } }, diff --git a/openviking/server/routers/pack.py b/openviking/server/routers/pack.py index e486870b..6a29d4ce 100644 --- a/openviking/server/routers/pack.py +++ b/openviking/server/routers/pack.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """Pack endpoints for OpenViking HTTP Server.""" +from typing import Optional + from fastapi import APIRouter, Depends from pydantic import BaseModel @@ -22,7 +24,8 @@ class ExportRequest(BaseModel): class ImportRequest(BaseModel): """Request model for import.""" - file_path: str + file_path: Optional[str] = None + temp_path: Optional[str] = None parent: str force: bool = False vectorize: bool = True @@ -46,8 +49,13 @@ async def import_ovpack( ): """Import .ovpack file.""" service = get_service() + + file_path = request.file_path + if request.temp_path: + file_path = request.temp_path + result = await service.pack.import_ovpack( - request.file_path, + file_path, request.parent, force=request.force, vectorize=request.vectorize, diff --git a/openviking/server/routers/resources.py b/openviking/server/routers/resources.py index 7291dc2f..b1705988 100644 --- a/openviking/server/routers/resources.py +++ b/openviking/server/routers/resources.py @@ -2,14 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 """Resource endpoints for OpenViking HTTP Server.""" +import time +import uuid +from pathlib import Path from typing import Any, Optional -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, File, UploadFile 
from pydantic import BaseModel from openviking.server.auth import verify_api_key from openviking.server.dependencies import get_service from openviking.server.models import Response +from openviking_cli.utils.config.open_viking_config import get_openviking_config router = APIRouter(prefix="/api/v1", tags=["resources"]) @@ -17,7 +21,8 @@ class AddResourceRequest(BaseModel): """Request model for add_resource.""" - path: str + path: Optional[str] = None + temp_path: Optional[str] = None target: Optional[str] = None reason: str = "" instruction: str = "" @@ -33,6 +38,44 @@ class AddSkillRequest(BaseModel): timeout: Optional[float] = None +def _cleanup_temp_files(temp_dir: Path, max_age_hours: int = 1): + """Clean up temporary files older than max_age_hours.""" + if not temp_dir.exists(): + return + + now = time.time() + max_age_seconds = max_age_hours * 3600 + + for file_path in temp_dir.iterdir(): + if file_path.is_file(): + file_age = now - file_path.stat().st_mtime + if file_age > max_age_seconds: + file_path.unlink(missing_ok=True) + + +@router.post("/resources/temp_upload") +async def temp_upload( + file: UploadFile = File(...), + _: bool = Depends(verify_api_key), +): + """Upload a temporary file for add_resource or import_ovpack.""" + config = get_openviking_config() + temp_dir = config.storage.get_upload_temp_dir() + + # Clean up old temporary files + _cleanup_temp_files(temp_dir) + + # Save the uploaded file + file_ext = Path(file.filename).suffix if file.filename else ".tmp" + temp_filename = f"upload_{uuid.uuid4().hex}{file_ext}" + temp_file_path = temp_dir / temp_filename + + with open(temp_file_path, "wb") as f: + f.write(await file.read()) + + return Response(status="ok", result={"temp_path": str(temp_file_path)}) + + @router.post("/resources") async def add_resource( request: AddResourceRequest, @@ -40,8 +83,13 @@ async def add_resource( ): """Add resource to OpenViking.""" service = get_service() + + path = request.path + if request.temp_path: + path = 
request.temp_path + result = await service.resources.add_resource( - path=request.path, + path=path, target=request.target, reason=request.reason, instruction=request.instruction, diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 3ff58b2e..3e2475bc 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """Unified resource processor with strategy-based routing.""" +import tempfile +import zipfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -103,6 +105,15 @@ async def _process_file( instruction: str, ) -> ParseResult: """Process file with unified parsing.""" + # Check if it's a zip file + if zipfile.is_zipfile(file_path): + temp_dir = Path(tempfile.mkdtemp()) + try: + with zipfile.ZipFile(file_path, "r") as zipf: + zipf.extractall(temp_dir) + return await self._process_directory(temp_dir, instruction) + finally: + pass # Don't delete temp_dir yet, it will be used by TreeBuilder return await parse( str(file_path), instruction=instruction, diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py index a5cb6903..b223ffc2 100644 --- a/openviking_cli/client/http.py +++ b/openviking_cli/client/http.py @@ -5,6 +5,10 @@ Implements BaseClient interface using HTTP calls to OpenViking Server. 
""" +import tempfile +import uuid +import zipfile +from pathlib import Path from typing import Any, Dict, List, Optional, Union import httpx @@ -219,6 +223,42 @@ def _raise_exception(self, error: Dict[str, Any]) -> None: else: raise exc_class(message) + def _is_local_server(self) -> bool: + """Check if the server URL is localhost or 127.0.0.1.""" + from urllib.parse import urlparse + + parsed_url = urlparse(self._url) + hostname = parsed_url.hostname + return hostname in ("localhost", "127.0.0.1") + + def _zip_directory(self, dir_path: str) -> str: + """Create a temporary zip file from a directory.""" + dir_path = Path(dir_path) + if not dir_path.is_dir(): + raise ValueError(f"Path {dir_path} is not a directory") + + temp_dir = tempfile.gettempdir() + zip_path = Path(temp_dir) / f"temp_upload_{uuid.uuid4().hex}.zip" + + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: + for file_path in dir_path.rglob("*"): + if file_path.is_file(): + arcname = file_path.relative_to(dir_path) + zipf.write(file_path, arcname=arcname) + + return str(zip_path) + + async def _upload_temp_file(self, file_path: str) -> str: + """Upload a file to /api/v1/resources/temp_upload and return the temp_path.""" + with open(file_path, "rb") as f: + files = {"file": (Path(file_path).name, f, "application/octet-stream")} + response = await self._http.post( + "/api/v1/resources/temp_upload", + files=files, + ) + result = self._handle_response(response) + return result.get("temp_path", "") + # ============= Resource Management ============= async def add_resource( @@ -231,16 +271,28 @@ async def add_resource( timeout: Optional[float] = None, ) -> Dict[str, Any]: """Add resource to OpenViking.""" + request_data = { + "target": target, + "reason": reason, + "instruction": instruction, + "wait": wait, + "timeout": timeout, + } + + path_obj = Path(path) + if path_obj.exists() and path_obj.is_dir() and not self._is_local_server(): + zip_path = self._zip_directory(path) + try: + temp_path 
= await self._upload_temp_file(zip_path) + request_data["temp_path"] = temp_path + finally: + Path(zip_path).unlink(missing_ok=True) + else: + request_data["path"] = path + response = await self._http.post( "/api/v1/resources", - json={ - "path": path, - "target": target, - "reason": reason, - "instruction": instruction, - "wait": wait, - "timeout": timeout, - }, + json=request_data, ) return self._handle_response(response) @@ -554,14 +606,22 @@ async def import_ovpack( ) -> str: """Import .ovpack file.""" parent = VikingURI.normalize(parent) + request_data = { + "parent": parent, + "force": force, + "vectorize": vectorize, + } + + file_path_obj = Path(file_path) + if file_path_obj.exists() and file_path_obj.is_file() and not self._is_local_server(): + temp_path = await self._upload_temp_file(file_path) + request_data["temp_path"] = temp_path + else: + request_data["file_path"] = file_path + response = await self._http.post( "/api/v1/pack/import", - json={ - "file_path": file_path, - "parent": parent, - "force": force, - "vectorize": vectorize, - }, + json=request_data, ) result = self._handle_response(response) return result.get("uri", "") diff --git a/openviking_cli/utils/config/agfs_config.py b/openviking_cli/utils/config/agfs_config.py index de7421e1..4f2d3d72 100644 --- a/openviking_cli/utils/config/agfs_config.py +++ b/openviking_cli/utils/config/agfs_config.py @@ -66,7 +66,10 @@ def validate_config(self): class AGFSConfig(BaseModel): """Configuration for AGFS (Agent Global File System).""" - path: str = Field(default="./data", description="AGFS data storage path") + path: Optional[str] = Field( + default=None, + description="[Deprecated in favor of `storage.workspace`] AGFS data storage path. 
This will be ignored if `storage.workspace` is set.", + ) port: int = Field(default=1833, description="AGFS service port") @@ -105,8 +108,7 @@ def validate_config(self): ) if self.backend == "local": - if not self.path: - raise ValueError("AGFS local backend requires 'path' to be set") + pass elif self.backend == "s3": # Validate S3 configuration diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index ac615e9c..14a36b74 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -138,7 +138,7 @@ def from_dict(cls, config: Dict[str, Any]) -> "OpenVikingConfig": # Remove sections managed by other loaders (e.g. server config) config_copy.pop("server", None) - + # Handle parser configurations from nested "parsers" section parser_configs = {} if "parsers" in config_copy: @@ -316,7 +316,7 @@ def initialize_openviking_config( Args: user: UserIdentifier for session management - path: Local storage path for embedded mode + path: Local storage path (workspace) for embedded mode Returns: Configured OpenVikingConfig instance @@ -337,9 +337,8 @@ def initialize_openviking_config( if path: # Embedded mode: local storage config.storage.agfs.backend = config.storage.agfs.backend or "local" - config.storage.agfs.path = path config.storage.vectordb.backend = config.storage.vectordb.backend or "local" - config.storage.vectordb.path = path + config.storage.workspace = path # Ensure vector dimension is synced if not set in storage if config.storage.vectordb.dimension == 0: diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py index b6ce378a..4682a10f 100644 --- a/openviking_cli/utils/config/storage_config.py +++ b/openviking_cli/utils/config/storage_config.py @@ -1,15 +1,27 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path from typing import Any, Dict -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator + +from openviking_cli.utils.logger import get_logger from .agfs_config import AGFSConfig from .vectordb_config import VectorDBBackendConfig +logger = get_logger(__name__) + class StorageConfig(BaseModel): - """Configuration for storage backend.""" + """Configuration for storage backend. + + The `workspace` field is the primary configuration for local data storage. + When `workspace` is set, it overrides the deprecated `path` fields in + `agfs` and `vectordb` configurations. + """ + + workspace: str = Field(default="./data", description="Local data storage path (primary)") agfs: AGFSConfig = Field(default_factory=lambda: AGFSConfig(), description="AGFS configuration") @@ -23,3 +35,52 @@ class StorageConfig(BaseModel): ) model_config = {"extra": "forbid"} + + @model_validator(mode="after") + def resolve_paths(self): + """Resolve path conflicts between workspace and individual path configs. + + When workspace is set: + - Ignore agfs.path and vectordb.path + - Set agfs.path to {workspace}/.agfs + - Set vectordb.path to {workspace}/vectordb + - Warn if agfs.path or vectordb.path were explicitly set to different values + """ + workspace_path = Path(self.workspace).resolve() + + # Check for AGFS path conflict + if self.agfs.path is not None: # User explicitly set agfs.path + agfs_path = Path(self.agfs.path).resolve() + expected_agfs_path = workspace_path / ".agfs" + if agfs_path != expected_agfs_path: + logger.warning( + f"StorageConfig: 'agfs.path' is deprecated and will be ignored. 
" + f"Using '{expected_agfs_path}' from workspace instead of '{agfs_path}'" + ) + + # Check for VectorDB path conflict + if self.vectordb.path is not None: # User explicitly set vectordb.path + vectordb_path = Path(self.vectordb.path).resolve() + expected_vectordb_path = workspace_path / "vectordb" + if vectordb_path != expected_vectordb_path: + logger.warning( + f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. " + f"Using '{expected_vectordb_path}' from workspace instead of '{vectordb_path}'" + ) + + # Update paths to use workspace + self.agfs.path = str(workspace_path / ".agfs") + self.vectordb.path = str(workspace_path / "vectordb") + + return self + + def get_upload_temp_dir(self) -> Path: + """Get the temporary directory for file uploads. + + Returns: + Path to {workspace}/temp/upload directory + """ + workspace_path = Path(self.workspace).resolve() + upload_temp_dir = workspace_path / "temp" / "upload" + upload_temp_dir.mkdir(parents=True, exist_ok=True) + return upload_temp_dir diff --git a/openviking_cli/utils/config/vectordb_config.py b/openviking_cli/utils/config/vectordb_config.py index 6984322c..2e2ccd42 100644 --- a/openviking_cli/utils/config/vectordb_config.py +++ b/openviking_cli/utils/config/vectordb_config.py @@ -46,7 +46,10 @@ class VectorDBBackendConfig(BaseModel): name: Optional[str] = Field(default=COLLECTION_NAME, description="Collection name for VectorDB") - path: Optional[str] = Field(default="./data", description="Local storage path for 'local' type") + path: Optional[str] = Field( + default=None, + description="[Deprecated in favor of `storage.workspace`] Local storage path for 'local' type. 
This will be ignored if `storage.workspace` is set.", + ) url: Optional[str] = Field( default=None, @@ -93,8 +96,7 @@ def validate_config(self): ) if self.backend == "local": - if not self.path: - raise ValueError("VectorDB local backend requires 'path' to be set") + pass elif self.backend == "http": if not self.url: From b100a2b7d4f65eb6d615ee20b267172a0917ad22 Mon Sep 17 00:00:00 2001 From: openviking Date: Sun, 22 Feb 2026 18:16:15 +0800 Subject: [PATCH 18/18] feat: support local upload --- crates/ov_cli/README.md | 2 + openviking/parse/directory_scan.py | 5 +- openviking_cli/utils/config/storage_config.py | 49 +++++++------------ pyproject.toml | 1 + uv.lock | 11 +++++ 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/crates/ov_cli/README.md b/crates/ov_cli/README.md index 12115a0f..e14b0a71 100644 --- a/crates/ov_cli/README.md +++ b/crates/ov_cli/README.md @@ -13,6 +13,8 @@ curl -fsSL https://raw.githubusercontent.com/volcengine/OpenViking/main/crates/o ### From Source ```bash +# openviking need rust >= 1.88, please upgrade it if necessary +# brew upgrade rust cargo install --path crates/ov_cli ``` diff --git a/openviking/parse/directory_scan.py b/openviking/parse/directory_scan.py index 8da532f2..07b10283 100644 --- a/openviking/parse/directory_scan.py +++ b/openviking/parse/directory_scan.py @@ -175,7 +175,7 @@ def _classify_file( def scan_directory( root: Union[str, Path], registry: Optional[ParserRegistry] = None, - strict: bool = True, + strict: bool = False, ignore_dirs: Optional[Set[str]] = None, include: Optional[str] = None, exclude: Optional[str] = None, @@ -272,7 +272,10 @@ def scan_directory( f"Unsupported: {unsupported_paths[:10]}{'...' 
if len(unsupported_paths) > 10 else ''}" ) if strict: + logger.error(msg) raise UnsupportedDirectoryFilesError(msg, unsupported_paths) + else: + logger.warning(msg) result.warnings.append(msg) for rel in unsupported_paths: result.warnings.append(f" - {rel}") diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py index 4682a10f..8daf6a79 100644 --- a/openviking_cli/utils/config/storage_config.py +++ b/openviking_cli/utils/config/storage_config.py @@ -38,40 +38,25 @@ class StorageConfig(BaseModel): @model_validator(mode="after") def resolve_paths(self): - """Resolve path conflicts between workspace and individual path configs. - - When workspace is set: - - Ignore agfs.path and vectordb.path - - Set agfs.path to {workspace}/.agfs - - Set vectordb.path to {workspace}/vectordb - - Warn if agfs.path or vectordb.path were explicitly set to different values - """ - workspace_path = Path(self.workspace).resolve() - - # Check for AGFS path conflict - if self.agfs.path is not None: # User explicitly set agfs.path - agfs_path = Path(self.agfs.path).resolve() - expected_agfs_path = workspace_path / ".agfs" - if agfs_path != expected_agfs_path: - logger.warning( - f"StorageConfig: 'agfs.path' is deprecated and will be ignored. " - f"Using '{expected_agfs_path}' from workspace instead of '{agfs_path}'" - ) - - # Check for VectorDB path conflict - if self.vectordb.path is not None: # User explicitly set vectordb.path - vectordb_path = Path(self.vectordb.path).resolve() - expected_vectordb_path = workspace_path / "vectordb" - if vectordb_path != expected_vectordb_path: - logger.warning( - f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. " - f"Using '{expected_vectordb_path}' from workspace instead of '{vectordb_path}'" - ) + if self.agfs.path is not None: + logger.warning( + f"StorageConfig: 'agfs.path' is deprecated and will be ignored. 
" + f"Using '{self.workspace}' from workspace instead of '{self.agfs.path}'" + ) + + if self.vectordb.path is not None: + logger.warning( + f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. " + f"Using '{self.workspace}' from workspace instead of '{self.vectordb.path}'" + ) # Update paths to use workspace - self.agfs.path = str(workspace_path / ".agfs") - self.vectordb.path = str(workspace_path / "vectordb") - + workspace_path = Path(self.workspace).resolve() + workspace_path.mkdir(parents=True, exist_ok=True) + self.workspace = str(workspace_path) + self.agfs.path = self.workspace + self.vectordb.path = self.workspace + # logger.info(f"StorageConfig: Using workspace '{self.workspace}' for storage") return self def get_upload_temp_dir(self) -> Path: diff --git a/pyproject.toml b/pyproject.toml index b6efd649..0924e749 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dependencies = [ "pdfminer-six>=20251230", "typer>=0.12.0", "litellm>=1.0.0", + "python-multipart>=0.0.22", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 07a6ba5d..678904d5 100644 --- a/uv.lock +++ b/uv.lock @@ -1915,6 +1915,7 @@ dependencies = [ { name = "pyagfs" }, { name = "pydantic" }, { name = "python-docx" }, + { name = "python-multipart" }, { name = "python-pptx" }, { name = "pyyaml" }, { name = "readabilipy" }, @@ -1973,6 +1974,7 @@ requires-dist = [ { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=0.21.0" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" }, { name = "python-docx", specifier = ">=1.0.0" }, + { name = "python-multipart", specifier = ">=0.0.22" }, { name = "python-pptx", specifier = ">=1.0.0" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "readabilipy", specifier = ">=0.2.0" }, @@ -2588,6 +2590,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash 
= "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, +] + [[package]] name = "python-pptx" version = "1.0.2"