From 5d85536707bbf7d451875c055d777ee8b0b7817b Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 11:49:00 +0800
Subject: [PATCH 01/18] fix: make rust CLI (ov) commands match python CLI
 (openviking) exactly - add top-level wait/status/health commands

---
 crates/ov_cli/src/main.rs | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs
index 0d2ac0ce..e164974e 100644
--- a/crates/ov_cli/src/main.rs
+++ b/crates/ov_cli/src/main.rs
@@ -125,6 +125,16 @@ enum Commands {
         #[arg(long)]
         no_vectorize: bool,
     },
+    /// Wait for queued async processing to complete
+    Wait {
+        /// Wait timeout in seconds
+        #[arg(long)]
+        timeout: Option<f64>,
+    },
+    /// Show OpenViking component status
+    Status,
+    /// Quick health check
+    Health,
     /// System utility commands
     System {
         #[command(subcommand)]
@@ -363,6 +373,15 @@ async fn main() {
         Commands::Import { file_path, target_uri, force, no_vectorize } => {
             handle_import(file_path, target_uri, force, no_vectorize, ctx).await
         }
+        Commands::Wait { timeout } => {
+            let client = ctx.get_client();
+            commands::system::wait(&client, timeout, ctx.output_format, ctx.compact).await
+        },
+        Commands::Status => {
+            let client = ctx.get_client();
+            commands::observer::system(&client, ctx.output_format, ctx.compact).await
+        },
+        Commands::Health => handle_health(ctx).await,
         Commands::System { action } => handle_system(action, ctx).await,
         Commands::Observer { action } => handle_observer(action, ctx).await,
         Commands::Session { action } => handle_session(action, ctx).await,
@@ -651,3 +670,14 @@ async fn handle_glob(pattern: String, uri: String, ctx: CliContext) -> Result<()
     let client = ctx.get_client();
     commands::search::glob(&client, &pattern, &uri, ctx.output_format, ctx.compact).await
 }
+
+async fn handle_health(ctx: CliContext) -> Result<()> {
+    let client = ctx.get_client();
+    let system_status: serde_json::Value = client.get("/api/v1/observer/system", &[]).await?;
+    let is_healthy = system_status.get("is_healthy").and_then(|v| v.as_bool()).unwrap_or(false);
+    output::output_success(&serde_json::json!({ "healthy": is_healthy }), ctx.output_format, ctx.compact);
+    if !is_healthy {
+        std::process::exit(1);
+    }
+    Ok(())
+}

From 0e08354febd487cd2e3b874102a1ae97ae31c4d8 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 12:32:43 +0800
Subject: [PATCH 02/18] fix: make ov CLI behave the same as the Python CLI (ls, tree)

---
 README.md                                 |  2 +-
 README_CN.md                              |  2 +-
 crates/ov_cli/src/client.rs               | 16 ++++++++--
 crates/ov_cli/src/commands/filesystem.rs  | 12 ++++++--
 crates/ov_cli/src/main.rs                 | 36 ++++++++++++++++++-----
 openviking/async_client.py                |  7 ++++-
 openviking/client/local.py                |  7 ++++-
 openviking/server/routers/filesystem.py   |  9 +++++-
 openviking/service/fs_service.py          | 15 ++++++++--
 openviking/storage/viking_fs.py           | 23 +++++++++++----
 openviking_cli/cli/commands/filesystem.py | 13 +++++++-
 openviking_cli/client/base.py             |  2 ++
 openviking_cli/client/http.py             |  4 +++
 openviking_cli/client/sync_http.py        | 11 +++++--
 14 files changed, 130 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index b8107d29..8f674818 100644
--- a/README.md
+++ b/README.md
@@ -581,7 +581,7 @@ Let's work together to define and build the future of AI Agent context managemen
 
 ### Star Trend
 
-[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=Timeline)](https://www.star-history.com/#volcengine/OpenViking&Timeline)
+[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=timeline&legend=top-left)](https://www.star-history.com/#volcengine/OpenViking&type=timeline&legend=top-left)
 
 ---
 
diff --git a/README_CN.md b/README_CN.md
index 0f2e76ca..c09354ed 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -457,7 +457,7 @@ OpenViking 目前还处于早期阶段，有许多需要完善和探索的地方
 
 ### Star 趋势
 
-[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=Timeline)](https://www.star-history.com/#volcengine/OpenViking&Timeline)
+[![Star History Chart](https://api.star-history.com/svg?repos=volcengine/OpenViking&type=timeline&legend=top-left)](https://www.star-history.com/#volcengine/OpenViking&type=timeline&legend=top-left)
 
 ---
 
diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs
index 53dc3b7b..2c2bcc8b 100644
--- a/crates/ov_cli/src/client.rs
+++ b/crates/ov_cli/src/client.rs
@@ -191,17 +191,27 @@ impl HttpClient {
 
     // ============ Filesystem Methods ============
 
-    pub async fn ls(&self, uri: &str, simple: bool, recursive: bool) -> Result<serde_json::Value> {
+    pub async fn ls(&self, uri: &str, simple: bool, recursive: bool, output: &str, abs_limit: i32, show_all_hidden: bool, node_limit: i32) -> Result<serde_json::Value> {
         let params = vec![
             ("uri".to_string(), uri.to_string()),
             ("simple".to_string(), simple.to_string()),
             ("recursive".to_string(), recursive.to_string()),
+            ("output".to_string(), output.to_string()),
+            ("abs_limit".to_string(), abs_limit.to_string()),
+            ("show_all_hidden".to_string(), show_all_hidden.to_string()),
+            ("node_limit".to_string(), node_limit.to_string()),
         ];
         self.get("/api/v1/fs/ls", &params).await
     }
 
-    pub async fn tree(&self, uri: &str) -> Result<serde_json::Value> {
-        let params = vec![("uri".to_string(), uri.to_string())];
+    pub async fn tree(&self, uri: &str, output: &str, abs_limit: i32, show_all_hidden: bool, node_limit: i32) -> Result<serde_json::Value> {
+        let params = vec![
+            ("uri".to_string(), uri.to_string()),
+            ("output".to_string(), output.to_string()),
+            ("abs_limit".to_string(), abs_limit.to_string()),
+            ("show_all_hidden".to_string(), show_all_hidden.to_string()),
+            ("node_limit".to_string(), node_limit.to_string()),
+        ];
         self.get("/api/v1/fs/tree", &params).await
     }
 
diff --git a/crates/ov_cli/src/commands/filesystem.rs b/crates/ov_cli/src/commands/filesystem.rs
index 0034c1a0..b281f1c7 100644
--- a/crates/ov_cli/src/commands/filesystem.rs
+++ b/crates/ov_cli/src/commands/filesystem.rs
@@ -7,10 +7,14 @@ pub async fn ls(
     uri: &str,
     simple: bool,
     recursive: bool,
+    output: &str,
+    abs_limit: i32,
+    show_all_hidden: bool,
+    node_limit: i32,
     output_format: OutputFormat,
     compact: bool,
 ) -> Result<()> {
-    let result = client.ls(uri, simple, recursive).await?;
+    let result = client.ls(uri, simple, recursive, output, abs_limit, show_all_hidden, node_limit).await?;
     output_success(&result, output_format, compact);
     Ok(())
 }
@@ -18,10 +22,14 @@ pub async fn ls(
 pub async fn tree(
     client: &HttpClient,
     uri: &str,
+    output: &str,
+    abs_limit: i32,
+    show_all_hidden: bool,
+    node_limit: i32,
     output_format: OutputFormat,
     compact: bool,
 ) -> Result<()> {
-    let result = client.tree(uri).await?;
+    let result = client.tree(uri, output, abs_limit, show_all_hidden, node_limit).await?;
     output_success(&result, output_format, compact);
     Ok(())
 }
diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs
index e164974e..b98aaf4c 100644
--- a/crates/ov_cli/src/main.rs
+++ b/crates/ov_cli/src/main.rs
@@ -162,11 +162,29 @@ enum Commands {
         /// List all subdirectories recursively
         #[arg(short, long)]
         recursive: bool,
+        /// Abstract content limit (only for agent output)
+        #[arg(long = "abs-limit", short = 'l', default_value = "256")]
+        abs_limit: i32,
+        /// Show all hidden files
+        #[arg(short, long)]
+        all: bool,
+        /// Maximum number of nodes to list
+        #[arg(long = "node-limit", short = 'n', default_value = "1000")]
+        node_limit: i32,
     },
     /// Get directory tree
     Tree {
         /// Viking URI to get tree for
         uri: String,
+        /// Abstract content limit (only for agent output)
+        #[arg(long = "abs-limit", short = 'l', default_value = "128")]
+        abs_limit: i32,
+        /// Show all hidden files
+        #[arg(short, long)]
+        all: bool,
+        /// Maximum number of nodes to list
+        #[arg(long = "node-limit", short = 'n', default_value = "1000")]
+        node_limit: i32,
     },
     /// Create directory
     Mkdir {
@@ -385,11 +403,11 @@ async fn main() {
         Commands::System { action } => handle_system(action, ctx).await,
         Commands::Observer { action } => handle_observer(action, ctx).await,
         Commands::Session { action } => handle_session(action, ctx).await,
-        Commands::Ls { uri, simple, recursive } => {
-            handle_ls(uri, simple, recursive, ctx).await
+        Commands::Ls { uri, simple, recursive, abs_limit, all, node_limit } => {
+            handle_ls(uri, simple, recursive, abs_limit, all, node_limit, ctx).await
         }
-        Commands::Tree { uri } => {
-            handle_tree(uri, ctx).await
+        Commands::Tree { uri, abs_limit, all, node_limit } => {
+            handle_tree(uri, abs_limit, all, node_limit, ctx).await
         }
         Commands::Mkdir { uri } => {
             handle_mkdir(uri, ctx).await
@@ -631,14 +649,16 @@ async fn handle_search(
     commands::search::search(&client, &query, &uri, session_id, limit, threshold, ctx.output_format, ctx.compact).await
 }
 
-async fn handle_ls(uri: String, simple: bool, recursive: bool, ctx: CliContext) -> Result<()> {
+async fn handle_ls(uri: String, simple: bool, recursive: bool, abs_limit: i32, show_all_hidden: bool, node_limit: i32, ctx: CliContext) -> Result<()> {
     let client = ctx.get_client();
-    commands::filesystem::ls(&client, &uri, simple, recursive, ctx.output_format, ctx.compact).await
+    let api_output = if ctx.compact { "agent" } else { "original" };
+    commands::filesystem::ls(&client, &uri, simple, recursive, api_output, abs_limit, show_all_hidden, node_limit, ctx.output_format, ctx.compact).await
 }
 
-async fn handle_tree(uri: String, ctx: CliContext) -> Result<()> {
+async fn handle_tree(uri: String, abs_limit: i32, show_all_hidden: bool, node_limit: i32, ctx: CliContext) -> Result<()> {
     let client = ctx.get_client();
-    commands::filesystem::tree(&client, &uri, ctx.output_format, ctx.compact).await
+    let api_output = if ctx.compact { "agent" } else { "original" };
+    commands::filesystem::tree(&client, &uri, api_output, abs_limit, show_all_hidden, node_limit, ctx.output_format, ctx.compact).await
 }
 
 async fn handle_mkdir(uri: String, ctx: CliContext) -> Result<()> {
diff --git a/openviking/async_client.py b/openviking/async_client.py
index b4f48ed5..84e3c7e1 100644
--- a/openviking/async_client.py
+++ b/openviking/async_client.py
@@ -306,8 +306,13 @@ async def tree(self, uri: str, **kwargs) -> Dict:
         output = kwargs.get("output", "original")
         abs_limit = kwargs.get("abs_limit", 128)
         show_all_hidden = kwargs.get("show_all_hidden", True)
+        node_limit = kwargs.get("node_limit", 1000)
         return await self._client.tree(
-            uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+            uri,
+            output=output,
+            abs_limit=abs_limit,
+            show_all_hidden=show_all_hidden,
+            node_limit=node_limit,
         )
 
     async def mkdir(self, uri: str) -> None:
diff --git a/openviking/client/local.py b/openviking/client/local.py
index 46acca99..389439ac 100644
--- a/openviking/client/local.py
+++ b/openviking/client/local.py
@@ -113,10 +113,15 @@ async def tree(
         output: str = "original",
         abs_limit: int = 128,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
         return await self._service.fs.tree(
-            uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+            uri,
+            output=output,
+            abs_limit=abs_limit,
+            show_all_hidden=show_all_hidden,
+            node_limit=node_limit,
         )
 
     async def stat(self, uri: str) -> Dict[str, Any]:
diff --git a/openviking/server/routers/filesystem.py b/openviking/server/routers/filesystem.py
index 5e71fe65..a24e64c6 100644
--- a/openviking/server/routers/filesystem.py
+++ b/openviking/server/routers/filesystem.py
@@ -22,6 +22,7 @@ async def ls(
     output: str = Query("agent", description="Output format: original or agent"),
     abs_limit: int = Query(256, description="Abstract limit (only for agent output)"),
     show_all_hidden: bool = Query(False, description="List all hidden files, like -a"),
+    node_limit: int = Query(1000, description="Maximum number of nodes to list"),
     _: bool = Depends(verify_api_key),
 ):
     """List directory contents."""
@@ -33,6 +34,7 @@ async def ls(
         output=output,
         abs_limit=abs_limit,
         show_all_hidden=show_all_hidden,
+        node_limit=node_limit,
     )
     return Response(status="ok", result=result)
 
@@ -43,12 +45,17 @@ async def tree(
     output: str = Query("agent", description="Output format: original or agent"),
     abs_limit: int = Query(256, description="Abstract limit (only for agent output)"),
     show_all_hidden: bool = Query(False, description="List all hidden files, like -a"),
+    node_limit: int = Query(1000, description="Maximum number of nodes to list"),
     _: bool = Depends(verify_api_key),
 ):
     """Get directory tree."""
     service = get_service()
     result = await service.fs.tree(
-        uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+        uri,
+        output=output,
+        abs_limit=abs_limit,
+        show_all_hidden=show_all_hidden,
+        node_limit=node_limit,
     )
     return Response(status="ok", result=result)
 
diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py
index 72e5526d..598e0fc0 100644
--- a/openviking/service/fs_service.py
+++ b/openviking/service/fs_service.py
@@ -39,6 +39,7 @@ async def ls(
         output: str = "original",
         abs_limit: int = 256,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents.
 
@@ -49,12 +50,17 @@ async def ls(
             output: str = "original" or "agent"
             abs_limit: int = 256 if output == "agent" else ignore
             show_all_hidden: bool = False (list all hidden files, like -a)
+            node_limit: int = 1000 (maximum number of nodes to list)
         """
         viking_fs = self._ensure_initialized()
 
         if recursive:
             entries = await viking_fs.tree(
-                uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+                uri,
+                output=output,
+                abs_limit=abs_limit,
+                show_all_hidden=show_all_hidden,
+                node_limit=node_limit,
             )
         else:
             entries = await viking_fs.ls(
@@ -86,11 +92,16 @@ async def tree(
         output: str = "original",
         abs_limit: int = 128,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
         viking_fs = self._ensure_initialized()
         return await viking_fs.tree(
-            uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+            uri,
+            output=output,
+            abs_limit=abs_limit,
+            show_all_hidden=show_all_hidden,
+            node_limit=node_limit,
         )
 
     async def stat(self, uri: str) -> Dict[str, Any]:
diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py
index 3f3f173e..a8e52c72 100644
--- a/openviking/storage/viking_fs.py
+++ b/openviking/storage/viking_fs.py
@@ -201,9 +201,9 @@ async def stat(self, uri: str) -> Dict[str, Any]:
         path = self._uri_to_path(uri)
         return self.agfs.stat(path)
 
-    async def glob(self, pattern: str, uri: str = "viking://") -> Dict:
+    async def glob(self, pattern: str, uri: str = "viking://", node_limit: int = 1000) -> Dict:
         """File pattern matching, supports **/*.md recursive."""
-        entries = await self.tree(uri)
+        entries = await self.tree(uri, node_limit=node_limit)
         base_uri = uri.rstrip("/")
         matches = []
         for entry in entries:
@@ -248,6 +248,7 @@ async def tree(
         output: str = "original",
         abs_limit: int = 256,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """
         Recursively list all contents (includes rel_path).
@@ -265,19 +266,25 @@ async def tree(
         [{'name': '.abstract.md', 'size': 100, 'modTime': '2026-02-11 16:52:16', 'isDir': False, 'rel_path': '.abstract.md', 'uri': 'viking://resources...', 'abstract': "..."}]
         """
         if output == "original":
-            return await self._tree_original(uri, show_all_hidden)
+            return await self._tree_original(uri, show_all_hidden, node_limit)
         elif output == "agent":
-            return await self._tree_agent(uri, abs_limit, show_all_hidden)
+            return await self._tree_agent(uri, abs_limit, show_all_hidden, node_limit)
         else:
             raise ValueError(f"Invalid output format: {output}")
 
-    async def _tree_original(self, uri: str, show_all_hidden: bool = False) -> List[Dict[str, Any]]:
+    async def _tree_original(
+        self, uri: str, show_all_hidden: bool = False, node_limit: int = 1000
+    ) -> List[Dict[str, Any]]:
         """Recursively list all contents (original format)."""
         path = self._uri_to_path(uri)
         all_entries = []
 
         async def _walk(current_path: str, current_rel: str):
+            if len(all_entries) >= node_limit:
+                return
             for entry in self.agfs.ls(current_path):
+                if len(all_entries) >= node_limit:
+                    break
                 name = entry.get("name", "")
                 if name in [".", ".."]:
                     continue
@@ -297,7 +304,7 @@ async def _walk(current_path: str, current_rel: str):
         return all_entries
 
     async def _tree_agent(
-        self, uri: str, abs_limit: int, show_all_hidden: bool = False
+        self, uri: str, abs_limit: int, show_all_hidden: bool = False, node_limit: int = 1000
     ) -> List[Dict[str, Any]]:
         """Recursively list all contents (agent format with abstracts)."""
         path = self._uri_to_path(uri)
@@ -305,7 +312,11 @@ async def _tree_agent(
         now = datetime.now()
 
         async def _walk(current_path: str, current_rel: str):
+            if len(all_entries) >= node_limit:
+                return
             for entry in self.agfs.ls(current_path):
+                if len(all_entries) >= node_limit:
+                    break
                 name = entry.get("name", "")
                 if name in [".", ".."]:
                     continue
diff --git a/openviking_cli/cli/commands/filesystem.py b/openviking_cli/cli/commands/filesystem.py
index 9b776562..27c7c44a 100644
--- a/openviking_cli/cli/commands/filesystem.py
+++ b/openviking_cli/cli/commands/filesystem.py
@@ -26,6 +26,9 @@ def ls_command(
         ),
         abs_limit: int = typer.Option(256, "--abs-limit", "-l", help="Abstract content limit"),
         show_all_hidden: bool = typer.Option(False, "--all", "-a", help="Show all hidden files"),
+        node_limit: int = typer.Option(
+            1000, "--node-limit", "-n", help="Maximum number of nodes to list"
+        ),
     ) -> None:
         """List directory contents."""
         run(
@@ -37,6 +40,7 @@ def ls_command(
                 output=output_format,
                 abs_limit=abs_limit,
                 show_all_hidden=show_all_hidden,
+                node_limit=node_limit,
             ),
         )
 
@@ -49,6 +53,9 @@ def tree_command(
         ),
         abs_limit: int = typer.Option(128, "--abs-limit", "-l", help="Abstract content limit"),
         show_all_hidden: bool = typer.Option(False, "--all", "-a", help="Show all hidden files"),
+        node_limit: int = typer.Option(
+            1000, "--node-limit", "-n", help="Maximum number of nodes to list"
+        ),
     ) -> None:
         """
         Get directory tree info.
@@ -56,7 +63,11 @@ def tree_command(
         run(
             ctx,
             lambda client: client.tree(
-                uri, output=output_format, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+                uri,
+                output=output_format,
+                abs_limit=abs_limit,
+                show_all_hidden=show_all_hidden,
+                node_limit=node_limit,
             ),
         )
 
diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py
index b72d4f3a..7882b585 100644
--- a/openviking_cli/client/base.py
+++ b/openviking_cli/client/base.py
@@ -68,6 +68,7 @@ async def ls(
         output: str = "original",
         abs_limit: int = 256,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
         ...
@@ -79,6 +80,7 @@ async def tree(
         output: str = "original",
         abs_limit: int = 128,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
         ...
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py
index a8ae3726..39526602 100644
--- a/openviking_cli/client/http.py
+++ b/openviking_cli/client/http.py
@@ -278,6 +278,7 @@ async def ls(
         output: str = "original",
         abs_limit: int = 256,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
         response = await self._http.get(
@@ -289,6 +290,7 @@ async def ls(
                 "output": output,
                 "abs_limit": abs_limit,
                 "show_all_hidden": show_all_hidden,
+                "node_limit": node_limit,
             },
         )
         return self._handle_response(response)
@@ -299,6 +301,7 @@ async def tree(
         output: str = "original",
         abs_limit: int = 128,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
         response = await self._http.get(
@@ -308,6 +311,7 @@ async def tree(
                 "output": output,
                 "abs_limit": abs_limit,
                 "show_all_hidden": show_all_hidden,
+                "node_limit": node_limit,
             },
         )
         return self._handle_response(response)
diff --git a/openviking_cli/client/sync_http.py b/openviking_cli/client/sync_http.py
index fdfa450f..97a9e0f2 100644
--- a/openviking_cli/client/sync_http.py
+++ b/openviking_cli/client/sync_http.py
@@ -160,6 +160,7 @@ def ls(
         output: str = "original",
         abs_limit: int = 256,
         show_all_hidden: bool = False,
+        node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
         return run_async(
@@ -170,6 +171,7 @@ def ls(
                 output=output,
                 abs_limit=abs_limit,
                 show_all_hidden=show_all_hidden,
+                node_limit=node_limit,
             )
         )
 
@@ -179,11 +181,16 @@ def tree(
         output: str = "original",
         abs_limit: int = 128,
         show_all_hidden: bool = False,
-    ) -> Dict:
+        node_limit: int = 1000,
+    ) -> List[Dict[str, Any]]:
         """Get directory tree."""
         return run_async(
             self._async_client.tree(
-                uri, output=output, abs_limit=abs_limit, show_all_hidden=show_all_hidden
+                uri,
+                output=output,
+                abs_limit=abs_limit,
+                show_all_hidden=show_all_hidden,
+                node_limit=node_limit,
             )
         )
 

From 7376d61aabd1eefbdbf794de61ca81b467dc8c91 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 13:24:35 +0800
Subject: [PATCH 03/18] Refactor media parsers to subdirectory structure with
 validation

---
 openviking/parse/parsers/README.md            |  17 +
 openviking/parse/parsers/media/__init__.py    |   8 +
 openviking/parse/parsers/media/audio.py       | 313 ++++++++++++++++++
 .../parsers/{media.py => media/image.py}      | 275 +--------------
 openviking/parse/parsers/media/video.py       | 292 ++++++++++++++++
 openviking/parse/registry.py                  |  10 +-
 openviking/parse/tree_builder.py              |  38 ++-
 7 files changed, 689 insertions(+), 264 deletions(-)
 create mode 100644 openviking/parse/parsers/media/__init__.py
 create mode 100644 openviking/parse/parsers/media/audio.py
 rename openviking/parse/parsers/{media.py => media/image.py} (53%)
 create mode 100644 openviking/parse/parsers/media/video.py

diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md
index b8bcefe0..7f452eb0 100644
--- a/openviking/parse/parsers/README.md
+++ b/openviking/parse/parsers/README.md
@@ -148,6 +148,23 @@ L1: """
 
 多媒体解析器，使用 VLM（视觉语言模型）分析图像、视频和音频内容，生成文本描述。
 
+对于添加多媒体文件的存储组织方式，我们采用以下策略：
+* 在 viking://resource 下创建 images, audio, video 三个媒体子目录，分别是：
+  * viking://resource/images 用于存储提交时未明确指定目标路径的图片文件
+  * viking://resource/audio 用于存储提交时未明确指定目标路径的音频文件
+  * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件
+* 在每个媒体子目录下，每次上传的文件放在以当前日期（而非文件内部元信息时间）命名的子目录下，例如：
+  * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件
+* 对于每个多媒体文件，默认创建一个文件夹，文件夹名称与文件名相同但默认不包含后缀，例如：
+  * 上传文件 `20240820_123456.jpg` 后，默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456` 用于存储该文件的相关内容
+  * 该文件夹内默认包含一个 `.abstract.md` 文件，用于存储该文件的摘要信息
+    * 例如：图片文件的摘要可能是图片的文件名、内容描述、画面风格等，正常不应超过 200 token
+  * 该文件夹内默认包含一个 `.overview.md` 文件，用于存储该文件的概览内容，例如：
+    * 图片文件的概览内容除了包含 `.abstract.md` 中的内容，还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等
+    * 音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果，以及主要的章节对应的时间线等
+    * 视频文件的概览内容可能包含视频的文件名、时长、使用场景等，对于较大的视频，未来会对视频进行切分，可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等，因此视频的处理逻辑预期较为复杂，可等待图片、音频实现后，参考文件夹或 zip 的递归处理形态进行处理。
+
+
 ## 核心组件
 
 ### BaseParser (`base_parser.py`)
diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py
new file mode 100644
index 00000000..7fed46b5
--- /dev/null
+++ b/openviking/parse/parsers/media/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+
+from .audio import AudioParser
+from .image import ImageParser
+from .video import VideoParser
+
+__all__ = ["ImageParser", "AudioParser", "VideoParser"]
diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
new file mode 100644
index 00000000..372a0cab
--- /dev/null
+++ b/openviking/parse/parsers/media/audio.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Audio parser - initial implementation (ASR integration pending).
+
+Planned Features:
+1. Speech-to-text transcription using ASR models
+2. Audio metadata extraction (duration, sample rate, channels)
+3. Speaker diarization (identify different speakers)
+4. Timestamp alignment for transcribed text
+5. Generate structured ResourceNode with transcript
+
+Example workflow:
+    1. Load audio file
+    2. Extract metadata (duration, format, sample rate)
+    3. Transcribe speech to text using Whisper or similar
+    4. (Optional) Perform speaker diarization
+    5. Create ResourceNode with:
+       - type: NodeType.ROOT
+       - children: sections for each speaker/timestamp
+       - meta: audio metadata and timestamps
+    6. Return ParseResult
+
+Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A, OPUS
+"""
+
+from pathlib import Path
+from typing import List, Optional, Union
+
+from openviking.parse.base import NodeType, ParseResult, ResourceNode
+from openviking.parse.parsers.base_parser import BaseParser
+from openviking_cli.utils.config.parser_config import AudioConfig
+
+
+class AudioParser(BaseParser):
+    """
+    Audio parser for audio files.
+    """
+
+    def __init__(self, config: Optional[AudioConfig] = None, **kwargs):
+        """
+        Initialize AudioParser.
+
+        Args:
+            config: Audio parsing configuration
+            **kwargs: Additional configuration parameters
+        """
+        self.config = config or AudioConfig()
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Return supported audio file extensions."""
+        return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+
+    async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
+        """
+        Parse audio file using three-phase architecture.
+
+        Phase 1: Generate temporary files
+        - Copy original audio to temp_uri/{file_stem}/content{ext}
+        - Generate description.md using ASR
+        - (Optional) Generate transcript with timestamps
+
+        Phase 2: Generate semantic info
+        - Generate abstract and overview based on description.md
+        - Overview includes file list and usage instructions
+
+        Phase 3: Build directory structure
+        - Move all files to final URI
+        - Generate .abstract.md, .overview.md
+
+        Args:
+            source: Audio file path
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with audio content
+
+        Raises:
+            FileNotFoundError: If source file does not exist
+            IOError: If audio processing fails
+        """
+        from openviking.storage.viking_fs import get_viking_fs
+
+        # Convert to Path object
+        file_path = Path(source) if isinstance(source, str) else source
+        if not file_path.exists():
+            raise FileNotFoundError(f"Audio file not found: {source}")
+
+        viking_fs = get_viking_fs()
+        temp_uri = viking_fs.create_temp_uri()
+
+        # Phase 1: Generate temporary files
+        audio_bytes = file_path.read_bytes()
+        ext = file_path.suffix
+
+        root_dir_name = file_path.stem
+        root_dir_uri = f"{temp_uri}/{root_dir_name}"
+        await viking_fs.mkdir(root_dir_uri)
+
+        # 1.1 Save original audio
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", audio_bytes)
+
+        # 1.2 Validate audio file using magic bytes
+        # Define magic bytes for supported audio formats
+        audio_magic_bytes = {
+            ".mp3": [b"ID3", b"\xff\xfb", b"\xff\xf3", b"\xff\xf2"],
+            ".wav": [b"RIFF"],
+            ".ogg": [b"OggS"],
+            ".flac": [b"fLaC"],
+            ".aac": [b"\xff\xf1", b"\xff\xf9"],
+            ".m4a": [b"\x00\x00\x00", b"ftypM4A", b"ftypisom"],
+            ".opus": [b"OggS"],
+        }
+
+        # Check magic bytes
+        valid = False
+        ext_lower = ext.lower()
+        magic_list = audio_magic_bytes.get(ext_lower, [])
+        for magic in magic_list:
+            if len(audio_bytes) >= len(magic) and audio_bytes.startswith(magic):
+                valid = True
+                break
+
+        if not valid:
+            raise ValueError(
+                f"Invalid audio file: {file_path}. File signature does not match expected format {ext_lower}"
+            )
+
+        # Extract audio metadata (placeholder)
+        duration = 0
+        sample_rate = 0
+        channels = 0
+        format_str = ext[1:].upper()
+
+        # 1.3 Generate ASR description
+        description = ""
+        if self.config.enable_transcription:
+            description = await self._asr_transcribe(audio_bytes, self.config.asr_model)
+        else:
+            # Fallback: basic description
+            description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)"
+
+        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
+
+        # 1.4 Transcript with timestamps (optional)
+        transcript_text = None
+        if self.config.enable_transcription and self.config.enable_timestamps:
+            transcript_text = await self._asr_transcribe_with_timestamps(
+                audio_bytes, self.config.asr_model
+            )
+            if transcript_text:
+                await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text)
+
+        # Create ResourceNode
+        root_node = ResourceNode(
+            type=NodeType.ROOT,
+            title=file_path.stem,
+            level=0,
+            detail_file=None,
+            content_path=None,
+            children=[],
+            meta={
+                "duration": duration,
+                "sample_rate": sample_rate,
+                "channels": channels,
+                "format": format_str.lower(),
+                "content_type": "audio",
+                "source_title": file_path.stem,
+                "semantic_name": file_path.stem,
+            },
+        )
+
+        # Phase 2: Generate semantic info
+        await self._generate_semantic_info(
+            root_node, root_dir_uri, viking_fs, transcript_text is not None
+        )
+
+        # Phase 3: Build directory structure (handled by TreeBuilder)
+        return ParseResult(
+            root=root_node,
+            source_path=str(file_path),
+            temp_dir_path=temp_uri,
+            source_format="audio",
+            parser_name="AudioParser",
+            meta={"content_type": "audio", "format": format_str.lower()},
+        )
+
+    async def _asr_transcribe(self, audio_bytes: bytes, model: Optional[str]) -> str:
+        """
+        Generate audio transcription using ASR.
+
+        Args:
+            audio_bytes: Audio binary data
+            model: ASR model name
+
+        Returns:
+            Audio transcription in markdown format
+
+        TODO: Integrate with actual ASR API (Whisper, etc.)
+        """
+        # Fallback implementation - returns basic placeholder
+        return "Audio transcription (ASR integration pending)\n\nThis is an audio. ASR transcription feature has not yet integrated external API."
+
+    async def _asr_transcribe_with_timestamps(
+        self, audio_bytes: bytes, model: Optional[str]
+    ) -> Optional[str]:
+        """
+        Extract transcription with timestamps from audio using ASR.
+
+        Args:
+            audio_bytes: Audio binary data
+            model: ASR model name
+
+        Returns:
+            Transcript with timestamps in markdown format, or None if not available
+
+        TODO: Integrate with ASR API
+        """
+        # Not implemented - return None
+        return None
+
+    async def _generate_semantic_info(
+        self, node: ResourceNode, temp_uri: str, viking_fs, has_transcript: bool
+    ):
+        """
+        Phase 2: Generate abstract and overview.
+
+        Args:
+            node: ResourceNode to update
+            temp_uri: URI of the temporary directory that holds description.md
+            viking_fs: VikingFS instance
+            has_transcript: Whether transcript file exists
+        """
+        # Read description.md
+        description = await viking_fs.read_file(f"{temp_uri}/description.md")
+
+        # Generate abstract (short summary: first 200 characters of the description)
+        abstract = description[:200] if len(description) > 200 else description
+
+        # Generate overview (content summary + file list + usage instructions)
+        overview_parts = [
+            "## Content Summary\n",
+            description,
+            "\n\n## Available Files\n",
+            f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n",
+            "- description.md: Detailed audio transcription generated by ASR\n",
+        ]
+
+        if has_transcript:
+            overview_parts.append("- transcript.md: Transcript with timestamps from the audio\n")
+
+        overview_parts.append("\n## Usage\n")
+        overview_parts.append("### Play Audio\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("audio_bytes = await audio_resource.play()\n")
+        overview_parts.append("# Returns: Audio file binary data\n")
+        overview_parts.append("# Purpose: Play or save the audio\n")
+        overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get ASR-generated Transcription\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("transcription = await audio_resource.transcription()\n")
+        overview_parts.append("# Returns: FileContent object for further processing\n")
+        overview_parts.append("# Purpose: Understand audio content\n")
+        overview_parts.append("```\n\n")
+
+        if has_transcript:
+            overview_parts.append("### Get Timestamps Transcript\n")
+            overview_parts.append("```python\n")
+            overview_parts.append("timestamps = await audio_resource.timestamps()\n")
+            overview_parts.append("# Returns: FileContent object or None\n")
+            overview_parts.append("# Purpose: Extract timestamped transcript from the audio\n")
+            overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get Audio Metadata\n")
+        overview_parts.append("```python\n")
+        overview_parts.append(
+            f"duration = audio_resource.get_duration()  # {node.meta['duration']}s\n"
+        )
+        overview_parts.append(
+            f"sample_rate = audio_resource.get_sample_rate()  # {node.meta['sample_rate']}Hz\n"
+        )
+        overview_parts.append(
+            f"channels = audio_resource.get_channels()  # {node.meta['channels']}\n"
+        )
+        overview_parts.append(f'format = audio_resource.get_format()  # "{node.meta["format"]}"\n')
+        overview_parts.append("```\n")
+
+        overview = "".join(overview_parts)
+
+        # Store in node meta
+        node.meta["abstract"] = abstract
+        node.meta["overview"] = overview
+
+    async def parse_content(
+        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
+    ) -> ParseResult:
+        """
+        Parse audio from content string - Not yet implemented.
+
+        Args:
+            content: Audio content (base64 or binary string)
+            source_path: Optional source path for metadata
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with audio content
+
+        Raises:
+            NotImplementedError: This feature is not yet implemented
+        """
+        raise NotImplementedError("Audio parsing from content not yet implemented")
diff --git a/openviking/parse/parsers/media.py b/openviking/parse/parsers/media/image.py
similarity index 53%
rename from openviking/parse/parsers/media.py
rename to openviking/parse/parsers/media/image.py
index cb25a803..0965b730 100644
--- a/openviking/parse/parsers/media.py
+++ b/openviking/parse/parsers/media/image.py
@@ -17,7 +17,7 @@
 
 from openviking.parse.base import NodeType, ParseResult, ResourceNode
 from openviking.parse.parsers.base_parser import BaseParser
-from openviking_cli.utils.config.parser_config import AudioConfig, ImageConfig, VideoConfig
+from openviking_cli.utils.config.parser_config import ImageConfig
 
 # =============================================================================
 # Configuration Classes
@@ -106,17 +106,23 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         image_bytes = file_path.read_bytes()
         ext = file_path.suffix
 
+        root_dir_name = file_path.stem
+        root_dir_uri = f"{temp_uri}/{root_dir_name}"
+        await viking_fs.mkdir(root_dir_uri)
+
         # 1.1 Save original image
-        await viking_fs.write_file_bytes(f"{temp_uri}/content{ext}", image_bytes)
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", image_bytes)
 
-        # 1.2 Extract image metadata
+        # 1.2 Validate and extract image metadata
         try:
+            img = Image.open(file_path)
+            img.verify()  # Verify that it's a valid image
+            img.close()  # Close and reopen to reset after verify()
             img = Image.open(file_path)
             width, height = img.size
             format_str = img.format or ext[1:].upper()
-        except Exception:
-            width, height = 0, 0
-            format_str = ext[1:].upper()
+        except Exception as e:
+            raise ValueError(f"Invalid image file: {file_path}. Error: {e}") from e
 
         # 1.3 Generate VLM description
         description = ""
@@ -126,14 +132,14 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             # Fallback: basic description
             description = f"Image file: {file_path.name} ({format_str}, {width}x{height})"
 
-        await viking_fs.write_file(f"{temp_uri}/description.md", description)
+        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
 
         # 1.4 OCR (optional)
         ocr_text = None
         if self.config.enable_ocr:
             ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang)
             if ocr_text:
-                await viking_fs.write_file(f"{temp_uri}/ocr.md", ocr_text)
+                await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text)
 
         # Create ResourceNode
         root_node = ResourceNode(
@@ -154,7 +160,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         )
 
         # Phase 2: Generate semantic info
-        await self._generate_semantic_info(root_node, temp_uri, viking_fs, ocr_text is not None)
+        await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, ocr_text is not None)
 
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
@@ -283,254 +289,3 @@ async def parse_content(
             NotImplementedError: This feature is not yet implemented
         """
         raise NotImplementedError("Image parsing not yet implemented")
-
-
-class AudioParser(BaseParser):
-    """
-    Audio parser - Future implementation.
-
-    Planned Features:
-    1. Speech-to-text transcription using ASR models
-    2. Audio metadata extraction (duration, sample rate, channels)
-    3. Speaker diarization (identify different speakers)
-    4. Timestamp alignment for transcribed text
-    5. Generate structured ResourceNode with transcript
-
-    Example workflow:
-        1. Load audio file
-        2. Extract metadata (duration, format, sample rate)
-        3. Transcribe speech to text using Whisper or similar
-        4. (Optional) Perform speaker diarization
-        5. Create ResourceNode with:
-           - type: NodeType.ROOT
-           - children: sections for each speaker/timestamp
-           - meta: audio metadata and timestamps
-        6. Return ParseResult
-
-    Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A
-    """
-
-    def __init__(self, config: Optional[AudioConfig] = None, **kwargs):
-        """
-        Initialize AudioParser.
-
-        Args:
-            config: Audio parsing configuration
-            **kwargs: Additional configuration parameters
-        """
-        self.config = config or AudioConfig()
-
-    @property
-    def supported_extensions(self) -> List[str]:
-        """Return supported audio file extensions."""
-        return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
-
-    async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
-        """
-        Parse audio file - Not yet implemented.
-
-        Planned implementation:
-        1. Load audio file
-        2. Extract metadata using librosa or similar
-        3. If enable_transcription:
-           - Transcribe using Whisper or similar ASR model
-           - Generate timestamps for each segment
-           - (Optional) Perform speaker diarization
-        4. Create ResourceNode tree:
-           - Root node with audio metadata
-           - Child nodes for each transcribed segment
-        5. Return ParseResult
-
-        Args:
-            source: Audio file path or URL
-            **kwargs: Additional parsing parameters
-
-        Returns:
-            ParseResult with transcribed content
-
-        Raises:
-            NotImplementedError: This feature is not yet implemented
-        """
-        raise NotImplementedError(
-            "Audio parsing is not yet implemented. "
-            "This is a placeholder interface for future expansion. "
-            "\n\nPlanned features:"
-            "\n- Speech-to-text transcription (Whisper)"
-            "\n- Speaker diarization"
-            "\n- Timestamp alignment"
-            "\n- Audio metadata extraction"
-            "\n\nWorkaround: Extract audio manually and add transcripts as "
-            "text or markdown files."
-        )
-
-    async def parse_content(
-        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
-    ) -> ParseResult:
-        """
-        Parse audio from content string - Not yet implemented.
-
-        Args:
-            content: Audio content (base64 or binary string)
-            source_path: Optional source path for metadata
-            **kwargs: Additional parsing parameters
-
-        Returns:
-            ParseResult with transcribed content
-
-        Raises:
-            NotImplementedError: This feature is not yet implemented
-        """
-        raise NotImplementedError("Audio parsing not yet implemented")
-
-
-class VideoParser(BaseParser):
-    """
-    Video parser - Future implementation.
-
-    Planned Features:
-    1. Key frame extraction at regular intervals
-    2. Audio track transcription using ASR
-    3. VLM-based scene description for key frames
-    4. Video metadata extraction (duration, resolution, codec)
-    5. Generate structured ResourceNode combining visual and audio
-
-    Example workflow:
-        1. Load video file
-        2. Extract metadata (duration, resolution, fps)
-        3. Extract audio track → transcribe using AudioParser
-        4. Extract key frames at specified intervals
-        5. For each frame: generate VLM description
-        6. Create ResourceNode tree:
-           - Root: video metadata
-           - Children: timeline nodes (each with frame + transcript)
-        7. Return ParseResult
-
-    Supported formats: MP4, AVI, MOV, MKV, WEBM
-    """
-
-    def __init__(self, config: Optional[VideoConfig] = None, **kwargs):
-        """
-        Initialize VideoParser.
-
-        Args:
-            config: Video parsing configuration
-            **kwargs: Additional configuration parameters
-        """
-        self.config = config or VideoConfig()
-
-    @property
-    def supported_extensions(self) -> List[str]:
-        """Return supported video file extensions."""
-        return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"]
-
-    async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
-        """
-        Parse video file - Not yet implemented.
-
-        Planned implementation:
-        1. Load video file using cv2 or similar
-        2. Extract metadata (duration, resolution, fps, codec)
-        3. Extract audio track:
-           - Save as temporary audio file
-           - Parse using AudioParser
-        4. Extract key frames:
-           - At specified intervals (e.g., every 10 seconds)
-           - Save frames as images
-        5. For each frame (if enable_vlm_description):
-           - Use VLM to generate scene description
-        6. Create ResourceNode tree:
-           - Root: video metadata
-           - Children: Timeline segments
-             - Each segment contains:
-               - Timestamp
-               - Frame description (VLM)
-               - Transcript (ASR)
-        7. Return ParseResult
-
-        Args:
-            source: Video file path or URL
-            **kwargs: Additional parsing parameters
-
-        Returns:
-            ParseResult with video content
-
-        Raises:
-            NotImplementedError: This feature is not yet implemented
-        """
-        raise NotImplementedError(
-            "Video parsing is not yet implemented. "
-            "This is a placeholder interface for future expansion. "
-            "\n\nPlanned features:"
-            "\n- Key frame extraction"
-            "\n- Audio track transcription"
-            "\n- VLM scene description"
-            "\n- Timeline-based structured output"
-            "\n\nWorkaround: Extract frames and audio manually, then process "
-            "as images and audio files."
-        )
-
-    async def parse_content(
-        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
-    ) -> ParseResult:
-        """
-        Parse video from content string - Not yet implemented.
-
-        Args:
-            content: Video content (base64 or binary string)
-            source_path: Optional source path for metadata
-            **kwargs: Additional parsing parameters
-
-        Returns:
-            ParseResult with video content
-
-        Raises:
-            NotImplementedError: This feature is not yet implemented
-        """
-        raise NotImplementedError("Video parsing not yet implemented")
-
-
-# =============================================================================
-# Utility Functions
-# =============================================================================
-
-
-def is_media_parser_available(parser_type: str) -> bool:
-    """
-    Check if a media parser type is currently available.
-
-    Args:
-        parser_type: Type of parser ("image", "audio", "video")
-
-    Returns:
-        False (all media parsers are future implementations)
-
-    Examples:
-        >>> is_media_parser_available("image")
-        False
-        >>> is_media_parser_available("video")
-        False
-    """
-    return False
-
-
-def get_media_parser_status() -> dict:
-    """
-    Get status of all media parsers.
-
-    Returns:
-        Dictionary with parser names and their implementation status
-
-    Examples:
-        >>> status = get_media_parser_status()
-        >>> print(status)
-        {
-            "image": "planned",
-            "audio": "planned",
-            "video": "planned"
-        }
-    """
-    return {
-        "image": "planned",
-        "audio": "planned",
-        "video": "planned",
-    }
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
new file mode 100644
index 00000000..807816e1
--- /dev/null
+++ b/openviking/parse/parsers/media/video.py
@@ -0,0 +1,292 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Video parser - initial implementation (video processing integration pending).
+
+Planned Features:
+1. Key frame extraction at regular intervals
+2. Audio track transcription using ASR
+3. VLM-based scene description for key frames
+4. Video metadata extraction (duration, resolution, codec)
+5. Generate structured ResourceNode combining visual and audio
+
+Example workflow:
+    1. Load video file
+    2. Extract metadata (duration, resolution, fps)
+    3. Extract audio track → transcribe using AudioParser
+    4. Extract key frames at specified intervals
+    5. For each frame: generate VLM description
+    6. Create ResourceNode tree:
+       - Root: video metadata
+       - Children: timeline nodes (each with frame + transcript)
+    7. Return ParseResult
+
+Supported formats: MP4, AVI, MOV, MKV, WEBM, FLV, WMV
+"""
+
+from pathlib import Path
+from typing import List, Optional, Union
+
+from openviking.parse.base import NodeType, ParseResult, ResourceNode
+from openviking.parse.parsers.base_parser import BaseParser
+from openviking_cli.utils.config.parser_config import VideoConfig
+
+
+class VideoParser(BaseParser):
+    """
+    Video parser for video files.
+    """
+
+    def __init__(self, config: Optional[VideoConfig] = None, **kwargs):
+        """
+        Initialize VideoParser.
+
+        Args:
+            config: Video parsing configuration
+            **kwargs: Additional configuration parameters
+        """
+        self.config = config or VideoConfig()
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Return supported video file extensions."""
+        return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"]
+
+    async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
+        """
+        Parse video file using three-phase architecture.
+
+        Phase 1: Generate temporary files
+        - Copy original video to temp_uri/{file_stem}/content{ext}
+        - Extract key frames
+        - Generate description.md for each frame using VLM
+        - Extract audio track and transcribe using ASR
+
+        Phase 2: Generate semantic info
+        - Generate abstract and overview based on descriptions
+        - Overview includes file list and usage instructions
+
+        Phase 3: Build directory structure
+        - Move all files to final URI
+        - Generate .abstract.md, .overview.md
+
+        Args:
+            source: Video file path
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with video content
+
+        Raises:
+            FileNotFoundError: If source file does not exist
+            IOError: If video processing fails
+        """
+        from openviking.storage.viking_fs import get_viking_fs
+
+        # Convert to Path object
+        file_path = Path(source) if isinstance(source, str) else source
+        if not file_path.exists():
+            raise FileNotFoundError(f"Video file not found: {source}")
+
+        viking_fs = get_viking_fs()
+        temp_uri = viking_fs.create_temp_uri()
+
+        # Phase 1: Generate temporary files
+        video_bytes = file_path.read_bytes()
+        ext = file_path.suffix
+
+        root_dir_name = file_path.stem
+        root_dir_uri = f"{temp_uri}/{root_dir_name}"
+        await viking_fs.mkdir(root_dir_uri)
+
+        # 1.1 Save original video
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", video_bytes)
+
+        # 1.2 Validate video file using magic bytes
+        # Define magic bytes for supported video formats
+        video_magic_bytes = {
+            ".mp4": [b"\x00\x00\x00", b"ftyp"],
+            ".avi": [b"RIFF"],
+            ".mov": [b"\x00\x00\x00", b"ftyp"],
+            ".mkv": [b"\x1a\x45\xdf\xa3"],
+            ".webm": [b"\x1a\x45\xdf\xa3"],
+            ".flv": [b"FLV"],
+            ".wmv": [b"\x30\x26\xb2\x75\x8e\x66\xcf\x11"],
+        }
+
+        # Check magic bytes
+        valid = False
+        ext_lower = ext.lower()
+        magic_list = video_magic_bytes.get(ext_lower, [])
+        for magic in magic_list:
+            if len(video_bytes) >= len(magic) and video_bytes.startswith(magic):
+                valid = True
+                break
+
+        if not valid:
+            raise ValueError(
+                f"Invalid video file: {file_path}. File signature does not match expected format {ext_lower}"
+            )
+
+        # Extract video metadata (placeholder)
+        duration = 0
+        width = 0
+        height = 0
+        fps = 0
+        format_str = ext[1:].upper()
+
+        # 1.3 Generate combined description
+        description = ""
+        if self.config.enable_key_frames or self.config.enable_audio_transcription:
+            description = await self._generate_video_description(file_path, self.config)
+        else:
+            # Fallback: basic description
+            description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)"
+
+        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
+
+        # 1.4 Key frames (optional)
+        key_frames_dir = f"{root_dir_uri}/keyframes"
+        has_key_frames = False
+        if self.config.enable_key_frames:
+            await viking_fs.mkdir(key_frames_dir)
+            has_key_frames = True
+
+        # Create ResourceNode
+        root_node = ResourceNode(
+            type=NodeType.ROOT,
+            title=file_path.stem,
+            level=0,
+            detail_file=None,
+            content_path=None,
+            children=[],
+            meta={
+                "duration": duration,
+                "width": width,
+                "height": height,
+                "fps": fps,
+                "format": format_str.lower(),
+                "content_type": "video",
+                "source_title": file_path.stem,
+                "semantic_name": file_path.stem,
+            },
+        )
+
+        # Phase 2: Generate semantic info
+        await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, has_key_frames)
+
+        # Phase 3: Build directory structure (handled by TreeBuilder)
+        return ParseResult(
+            root=root_node,
+            source_path=str(file_path),
+            temp_dir_path=temp_uri,
+            source_format="video",
+            parser_name="VideoParser",
+            meta={"content_type": "video", "format": format_str.lower()},
+        )
+
+    async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str:
+        """
+        Generate video description using key frames and audio transcription.
+
+        Args:
+            file_path: Video file path
+            config: Video parsing configuration
+
+        Returns:
+            Video description in markdown format
+
+        TODO: Integrate with actual video processing libraries
+        """
+        # Fallback implementation - returns basic placeholder
+        return "Video description (video processing integration pending)\n\nThis is a video. Video processing feature has not yet integrated external libraries."
+
+    async def _generate_semantic_info(
+        self, node: ResourceNode, temp_uri: str, viking_fs, has_key_frames: bool
+    ):
+        """
+        Phase 2: Generate abstract and overview.
+
+        Args:
+            node: ResourceNode to update
+            temp_uri: URI of the temporary directory that holds description.md
+            viking_fs: VikingFS instance
+            has_key_frames: Whether key frames directory exists
+        """
+        # Read description.md
+        description = await viking_fs.read_file(f"{temp_uri}/description.md")
+
+        # Generate abstract (short summary: first 200 characters of the description)
+        abstract = description[:200] if len(description) > 200 else description
+
+        # Generate overview (content summary + file list + usage instructions)
+        overview_parts = [
+            "## Content Summary\n",
+            description,
+            "\n\n## Available Files\n",
+            f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
+            "- description.md: Detailed video description\n",
+        ]
+
+        if has_key_frames:
+            overview_parts.append("- keyframes/: Directory containing extracted key frames\n")
+
+        overview_parts.append("\n## Usage\n")
+        overview_parts.append("### Play Video\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("video_bytes = await video_resource.play()\n")
+        overview_parts.append("# Returns: Video file binary data\n")
+        overview_parts.append("# Purpose: Play or save the video\n")
+        overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get Video Description\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("description = await video_resource.description()\n")
+        overview_parts.append("# Returns: FileContent object for further processing\n")
+        overview_parts.append("# Purpose: Understand video content\n")
+        overview_parts.append("```\n\n")
+
+        if has_key_frames:
+            overview_parts.append("### Get Key Frames\n")
+            overview_parts.append("```python\n")
+            overview_parts.append("keyframes = await video_resource.keyframes()\n")
+            overview_parts.append("# Returns: List of key frame resources\n")
+            overview_parts.append("# Purpose: Analyze video scenes\n")
+            overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get Video Metadata\n")
+        overview_parts.append("```python\n")
+        overview_parts.append(
+            f"duration = video_resource.get_duration()  # {node.meta['duration']}s\n"
+        )
+        overview_parts.append(
+            f"resolution = video_resource.get_resolution()  # ({node.meta['width']}, {node.meta['height']})\n"
+        )
+        overview_parts.append(f"fps = video_resource.get_fps()  # {node.meta['fps']}\n")
+        overview_parts.append(f'format = video_resource.get_format()  # "{node.meta["format"]}"\n')
+        overview_parts.append("```\n")
+
+        overview = "".join(overview_parts)
+
+        # Store in node meta
+        node.meta["abstract"] = abstract
+        node.meta["overview"] = overview
+
+    async def parse_content(
+        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
+    ) -> ParseResult:
+        """
+        Parse video from content string - Not yet implemented.
+
+        Args:
+            content: Video content (base64 or binary string)
+            source_path: Optional source path for metadata
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with video content
+
+        Raises:
+            NotImplementedError: This feature is not yet implemented
+        """
+        raise NotImplementedError("Video parsing from content not yet implemented")
diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index dd35062c..11af717f 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -74,12 +74,18 @@ def __init__(self, register_optional: bool = True):
         # Register optional media parsers
         if register_optional:
             try:
-                from openviking.parse.parsers.media import ImageParser
+                from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser
 
                 self.register("image", ImageParser())
                 logger.info("Registered ImageParser for image formats")
+
+                self.register("audio", AudioParser())
+                logger.info("Registered AudioParser for audio formats")
+
+                self.register("video", VideoParser())
+                logger.info("Registered VideoParser for video formats")
             except ImportError as e:
-                logger.debug(f"ImageParser not registered: {e}")
+                logger.debug(f"Media parsers not registered: {e}")
 
     def register(self, name: str, parser: BaseParser) -> None:
         """
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index 35359044..b4bcb3dd 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -108,14 +108,48 @@ async def finalize_from_temp(
         doc_dirs = [e for e in entries if e.get("isDir") and e["name"] not in [".", ".."]]
 
         if len(doc_dirs) != 1:
-            raise ValueError(f"Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}")
+            logger.error(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
+            raise ValueError(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
 
         doc_name = doc_dirs[0]["name"]
         doc_uri = f"{temp_uri}/{doc_name}"
 
         # 2. Determine base_uri
         if base_uri is None:
-            base_uri = self._get_base_uri(scope)
+            # Check if it's a media file (image/audio/video)
+            media_type = None
+            if source_format:
+                if source_format in ["image", "audio", "video"]:
+                    media_type = source_format
+            elif source_path:
+                from pathlib import Path
+
+                ext = Path(source_path).suffix.lower()
+                image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
+                audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+                video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv"]
+                if ext in image_exts:
+                    media_type = "image"
+                elif ext in audio_exts:
+                    media_type = "audio"
+                elif ext in video_exts:
+                    media_type = "video"
+
+            if media_type:
+                # Map singular media types to plural directory names
+                media_dir_map = {"image": "images", "audio": "audio", "video": "video"}
+                media_dir = media_dir_map.get(media_type, media_type)
+                # Get current date in YYYYMMDD format
+                from datetime import datetime
+
+                date_str = datetime.now().strftime("%Y%m%d")
+                base_uri = f"viking://resources/{media_dir}/{date_str}"
+            else:
+                base_uri = self._get_base_uri(scope)
 
         logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}")
 

From 2f3ec8c6d8a2bf3759d26a8d31a9843b0b3a2563 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 13:30:30 +0800
Subject: [PATCH 04/18] Enhance CLI robustness: validate add-resource path
 exists and detect unquoted spaces

---
 crates/ov_cli/src/main.rs                | 25 ++++++++++++
 openviking_cli/cli/commands/resources.py | 50 ++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs
index b98aaf4c..4c2eb9be 100644
--- a/crates/ov_cli/src/main.rs
+++ b/crates/ov_cli/src/main.rs
@@ -458,6 +458,31 @@ async fn handle_add_resource(
     timeout: Option<f64>,
     ctx: CliContext,
 ) -> Result<()> {
+    // Validate path: if it's a local path, check if it exists
+    if !path.starts_with("http://") && !path.starts_with("https://") {
+        use std::path::Path;
+        
+        let path_obj = Path::new(&path);
+        if !path_obj.exists() {
+            eprintln!("Error: Path '{}' does not exist.", path);
+            
+            // Check if there might be unquoted spaces
+            use std::env;
+            let args: Vec<String> = env::args().collect();
+            
+            if let Some(add_resource_pos) = args.iter().position(|s| s == "add-resource" || s == "add") {
+                if args.len() > add_resource_pos + 2 {
+                    let extra_args = &args[add_resource_pos + 2..];
+                    let suggested_path = format!("{} {}", path, extra_args.join(" "));
+                    eprintln!("\nIt looks like you may have forgotten to quote a path with spaces.");
+                    eprintln!("Suggested command: ov add-resource \"{}\"", suggested_path);
+                }
+            }
+            
+            std::process::exit(1);
+        }
+    }
+    
     let client = ctx.get_client();
     commands::resources::add_resource(
         &client, &path, to, reason, instruction, wait, timeout, ctx.output_format, ctx.compact
diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py
index 92940dd7..cd72b76a 100644
--- a/openviking_cli/cli/commands/resources.py
+++ b/openviking_cli/cli/commands/resources.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Resource management commands."""
 
+from pathlib import Path
 from typing import Optional
 
 import typer
@@ -23,6 +24,55 @@ def add_resource_command(
         timeout: Optional[float] = typer.Option(600.0, help="Wait timeout in seconds"),
     ) -> None:
         """Add resources into OpenViking."""
+        # Validate path: if it's a local path, check if it exists
+        if not (path.startswith("http://") or path.startswith("https://")):
+            local_path = Path(path)
+            if not local_path.exists():
+                # Check if there are extra arguments (possible unquoted path with spaces)
+                import sys
+
+                # Find the index of 'add-resource' in sys.argv
+                try:
+                    add_resource_idx = sys.argv.index("add-resource")
+                except ValueError:
+                    add_resource_idx = sys.argv.index("add") if "add" in sys.argv else -1
+
+                if add_resource_idx != -1 and len(sys.argv) > add_resource_idx + 2:
+                    # There are extra positional arguments - likely unquoted path with spaces
+                    extra_args = sys.argv[add_resource_idx + 2 :]
+                    suggested_path = f"{path} {' '.join(extra_args)}"
+                    typer.echo(
+                        typer.style(
+                            f"Error: Path '{path}' does not exist.",
+                            fg=typer.colors.RED,
+                            bold=True,
+                        ),
+                        err=True,
+                    )
+                    typer.echo(
+                        typer.style(
+                            "\nIt looks like you may have forgotten to quote a path with spaces.",
+                            fg=typer.colors.YELLOW,
+                        ),
+                        err=True,
+                    )
+                    typer.echo(
+                        typer.style(
+                            f'Suggested command: ov add-resource "{suggested_path}"',
+                            fg=typer.colors.CYAN,
+                        ),
+                        err=True,
+                    )
+                    raise typer.Exit(code=1)
+                else:
+                    typer.echo(
+                        typer.style(
+                            f"Error: Path '{path}' does not exist.", fg=typer.colors.RED, bold=True
+                        ),
+                        err=True,
+                    )
+                    raise typer.Exit(code=1)
+
         run(
             ctx,
             lambda client: client.add_resource(

From c4b5960a6a94455ca5d79d41ffe57a02a65bad37 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 13:37:40 +0800
Subject: [PATCH 05/18] Unescape backslash-escaped spaces in paths by
 replacing "\ " with a space

---
 crates/ov_cli/src/main.rs                | 7 +++++--
 openviking_cli/cli/commands/resources.py | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs
index 4c2eb9be..e245a0e4 100644
--- a/crates/ov_cli/src/main.rs
+++ b/crates/ov_cli/src/main.rs
@@ -450,7 +450,7 @@ async fn main() {
 }
 
 async fn handle_add_resource(
-    path: String,
+    mut path: String,
     to: Option<String>,
     reason: String,
     instruction: String,
@@ -462,7 +462,9 @@ async fn handle_add_resource(
     if !path.starts_with("http://") && !path.starts_with("https://") {
         use std::path::Path;
         
-        let path_obj = Path::new(&path);
+        // Unescape path: replace backslash followed by space with just space
+        let unescaped_path = path.replace("\\ ", " ");
+        let path_obj = Path::new(&unescaped_path);
         if !path_obj.exists() {
             eprintln!("Error: Path '{}' does not exist.", path);
             
@@ -481,6 +483,7 @@ async fn handle_add_resource(
             
             std::process::exit(1);
         }
+        path = unescaped_path;
     }
     
     let client = ctx.get_client();
diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py
index cd72b76a..a9bfc28f 100644
--- a/openviking_cli/cli/commands/resources.py
+++ b/openviking_cli/cli/commands/resources.py
@@ -25,8 +25,11 @@ def add_resource_command(
     ) -> None:
         """Add resources into OpenViking."""
         # Validate path: if it's a local path, check if it exists
+        final_path = path
         if not (path.startswith("http://") or path.startswith("https://")):
-            local_path = Path(path)
+            unescaped_path = path.replace("\\ ", " ")
+            local_path = Path(unescaped_path)
+            final_path = unescaped_path
             if not local_path.exists():
                 # Check if there are extra arguments (possible unquoted path with spaces)
                 import sys
@@ -76,7 +79,7 @@ def add_resource_command(
         run(
             ctx,
             lambda client: client.add_resource(
-                path=path,
+                path=final_path,
                 target=to,
                 reason=reason,
                 instruction=instruction,

From da4386ad4f5aa29d50688c3dc50f15705a6599f4 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 13:42:51 +0800
Subject: [PATCH 06/18] Sanitize URI components to replace spaces and special
 chars with underscores

---
 openviking/parse/parsers/media/audio.py | 4 +++-
 openviking/parse/parsers/media/image.py | 4 +++-
 openviking/parse/parsers/media/video.py | 4 +++-
 openviking/parse/tree_builder.py        | 4 +++-
 openviking_cli/utils/uri.py             | 4 ++--
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
index 372a0cab..e60e7fbe 100644
--- a/openviking/parse/parsers/media/audio.py
+++ b/openviking/parse/parsers/media/audio.py
@@ -94,7 +94,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         audio_bytes = file_path.read_bytes()
         ext = file_path.suffix
 
-        root_dir_name = file_path.stem
+        from openviking_cli.utils.uri import VikingURI
+
+        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py
index 0965b730..adde531c 100644
--- a/openviking/parse/parsers/media/image.py
+++ b/openviking/parse/parsers/media/image.py
@@ -106,7 +106,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         image_bytes = file_path.read_bytes()
         ext = file_path.suffix
 
-        root_dir_name = file_path.stem
+        from openviking_cli.utils.uri import VikingURI
+
+        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
index 807816e1..fe50776d 100644
--- a/openviking/parse/parsers/media/video.py
+++ b/openviking/parse/parsers/media/video.py
@@ -95,7 +95,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         video_bytes = file_path.read_bytes()
         ext = file_path.suffix
 
-        root_dir_name = file_path.stem
+        from openviking_cli.utils.uri import VikingURI
+
+        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index b4bcb3dd..11ff07be 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -115,7 +115,9 @@ async def finalize_from_temp(
                 f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
             )
 
-        doc_name = doc_dirs[0]["name"]
+        from openviking_cli.utils.uri import VikingURI
+
+        doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"])
         doc_uri = f"{temp_uri}/{doc_name}"
 
         # 2. Determine base_uri
diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py
index 9cf6d856..efa8744f 100644
--- a/openviking_cli/utils/uri.py
+++ b/openviking_cli/utils/uri.py
@@ -201,7 +201,7 @@ def build_semantic_uri(
         Build a semantic URI based on parent URI.
         """
         # Sanitize semantic name for URI
-        safe_name = VikingURI._sanitize_segment(semantic_name)
+        safe_name = VikingURI.sanitize_segment(semantic_name)
 
         if not is_leaf:
             return f"{parent_uri}/{safe_name}"
@@ -211,7 +211,7 @@ def build_semantic_uri(
             return f"{parent_uri}/{safe_name}/{node_id}"
 
     @staticmethod
-    def _sanitize_segment(text: str) -> str:
+    def sanitize_segment(text: str) -> str:
         """
         Sanitize text for use in URI segment.
 

From b2fdfc0225fa3db35ebdefbfb31e2f733c40f34a Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 16:15:40 +0800
Subject: [PATCH 07/18] feat: auto-organize audio, image, and video files

---
 openviking/parse/parsers/html.py        |  6 +++-
 openviking/parse/parsers/markdown.py    |  6 +++-
 openviking/parse/parsers/media/audio.py | 22 +++----------
 openviking/parse/parsers/media/image.py | 22 +++----------
 openviking/parse/parsers/media/video.py | 20 ++---------
 openviking_cli/client/http.py           | 27 +++++++++++++++
 openviking_cli/utils/uri.py             | 44 +++++++++++++++++++++++--
 7 files changed, 89 insertions(+), 58 deletions(-)

diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
index 28e47885..85fd0c0a 100644
--- a/openviking/parse/parsers/html.py
+++ b/openviking/parse/parsers/html.py
@@ -601,6 +601,10 @@ async def parse_content(
 
     def _sanitize_for_path(self, text: str) -> str:
         """Sanitize text for use in file path."""
-        safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text)
+        safe = re.sub(
+            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]",
+            "",
+            text,
+        )
         safe = re.sub(r"\s+", "_", safe)
         return safe.strip("_")[:50] or "section"
diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index e6ddbe3a..1570baf3 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -334,7 +334,11 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
         return parts if parts else [content]
 
     def _sanitize_for_path(self, text: str) -> str:
-        safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text)
+        safe = re.sub(
+            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]",
+            "",
+            text,
+        )
         safe = re.sub(r"\s+", "_", safe)
         return safe.strip("_")[:50] or "section"
 
diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
index e60e7fbe..cfb7eaab 100644
--- a/openviking/parse/parsers/media/audio.py
+++ b/openviking/parse/parsers/media/audio.py
@@ -58,11 +58,10 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         Phase 1: Generate temporary files
         - Copy original audio to temp_uri/content.{ext}
-        - Generate description.md using ASR
         - (Optional) Generate transcript with timestamps
 
         Phase 2: Generate semantic info
-        - Generate abstract and overview based on description.md
+        - Generate abstract and overview based on description
         - Overview includes file list and usage instructions
 
         Phase 3: Build directory structure
@@ -143,8 +142,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             # Fallback: basic description
             description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)"
 
-        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
-
         # 1.4 Transcript with timestamps (optional)
         transcript_text = None
         if self.config.enable_transcription and self.config.enable_timestamps:
@@ -175,7 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         # Phase 2: Generate semantic info
         await self._generate_semantic_info(
-            root_node, root_dir_uri, viking_fs, transcript_text is not None
+            root_node, description, viking_fs, transcript_text is not None
         )
 
         # Phase 3: Build directory structure (handled by TreeBuilder)
@@ -223,20 +220,17 @@ async def _asr_transcribe_with_timestamps(
         return None
 
     async def _generate_semantic_info(
-        self, node: ResourceNode, temp_uri: str, viking_fs, has_transcript: bool
+        self, node: ResourceNode, description: str, viking_fs, has_transcript: bool
     ):
         """
         Phase 2: Generate abstract and overview.
 
         Args:
             node: ResourceNode to update
-            temp_uri: Temporary URI
+            description: Audio description
             viking_fs: VikingFS instance
             has_transcript: Whether transcript file exists
         """
-        # Read description.md
-        description = await viking_fs.read_file(f"{temp_uri}/description.md")
-
         # Generate abstract (short summary, < 100 tokens)
         abstract = description[:200] if len(description) > 200 else description
 
@@ -246,7 +240,6 @@ async def _generate_semantic_info(
             description,
             "\n\n## Available Files\n",
             f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n",
-            "- description.md: Detailed audio transcription generated by ASR\n",
         ]
 
         if has_transcript:
@@ -260,13 +253,6 @@ async def _generate_semantic_info(
         overview_parts.append("# Purpose: Play or save the audio\n")
         overview_parts.append("```\n\n")
 
-        overview_parts.append("### Get ASR-generated Transcription\n")
-        overview_parts.append("```python\n")
-        overview_parts.append("transcription = await audio_resource.transcription()\n")
-        overview_parts.append("# Returns: FileContent object for further processing\n")
-        overview_parts.append("# Purpose: Understand audio content\n")
-        overview_parts.append("```\n\n")
-
         if has_transcript:
             overview_parts.append("### Get Timestamps Transcript\n")
             overview_parts.append("```python\n")
diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py
index adde531c..544ba80c 100644
--- a/openviking/parse/parsers/media/image.py
+++ b/openviking/parse/parsers/media/image.py
@@ -70,11 +70,10 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         Phase 1: Generate temporary files
         - Copy original image to temp_uri/content.{ext}
-        - Generate description.md using VLM
         - (Optional) Generate ocr.md using OCR
 
         Phase 2: Generate semantic info
-        - Generate abstract and overview based on description.md
+        - Generate abstract and overview based on description
         - Overview includes file list and usage instructions
 
         Phase 3: Build directory structure
@@ -134,8 +133,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             # Fallback: basic description
             description = f"Image file: {file_path.name} ({format_str}, {width}x{height})"
 
-        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
-
         # 1.4 OCR (optional)
         ocr_text = None
         if self.config.enable_ocr:
@@ -162,7 +159,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         )
 
         # Phase 2: Generate semantic info
-        await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, ocr_text is not None)
+        await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None)
 
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
@@ -207,20 +204,17 @@ async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]:
         return None
 
     async def _generate_semantic_info(
-        self, node: ResourceNode, temp_uri: str, viking_fs, has_ocr: bool
+        self, node: ResourceNode, description: str, viking_fs, has_ocr: bool
     ):
         """
         Phase 2: Generate abstract and overview.
 
         Args:
             node: ResourceNode to update
-            temp_uri: Temporary URI
+            description: Image description
             viking_fs: VikingFS instance
             has_ocr: Whether OCR file exists
         """
-        # Read description.md
-        description = await viking_fs.read_file(f"{temp_uri}/description.md")
-
         # Generate abstract (short summary, < 100 tokens)
         abstract = description[:200] if len(description) > 200 else description
 
@@ -230,7 +224,6 @@ async def _generate_semantic_info(
             description,
             "\n\n## Available Files\n",
             f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n",
-            "- description.md: Detailed image description generated by VLM\n",
         ]
 
         if has_ocr:
@@ -244,13 +237,6 @@ async def _generate_semantic_info(
         overview_parts.append("# Purpose: Display or save the image\n")
         overview_parts.append("```\n\n")
 
-        overview_parts.append("### Get VLM-generated Image Description\n")
-        overview_parts.append("```python\n")
-        overview_parts.append("description = await image_resource.description()\n")
-        overview_parts.append("# Returns: FileContent object for further processing\n")
-        overview_parts.append("# Purpose: Understand image content\n")
-        overview_parts.append("```\n\n")
-
         if has_ocr:
             overview_parts.append("### Get OCR-recognized Text\n")
             overview_parts.append("```python\n")
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
index fe50776d..ea274e2c 100644
--- a/openviking/parse/parsers/media/video.py
+++ b/openviking/parse/parsers/media/video.py
@@ -59,7 +59,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         Phase 1: Generate temporary files
         - Copy original video to temp_uri/content.{ext}
         - Extract key frames
-        - Generate description.md for each frame using VLM
         - Extract audio track and transcribe using ASR
 
         Phase 2: Generate semantic info
@@ -145,8 +144,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             # Fallback: basic description
             description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)"
 
-        await viking_fs.write_file(f"{root_dir_uri}/description.md", description)
-
         # 1.4 Key frames (optional)
         key_frames_dir = f"{root_dir_uri}/keyframes"
         has_key_frames = False
@@ -175,7 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         )
 
         # Phase 2: Generate semantic info
-        await self._generate_semantic_info(root_node, root_dir_uri, viking_fs, has_key_frames)
+        await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames)
 
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
@@ -204,20 +201,17 @@ async def _generate_video_description(self, file_path: Path, config: VideoConfig
         return "Video description (video processing integration pending)\n\nThis is a video. Video processing feature has not yet integrated external libraries."
 
     async def _generate_semantic_info(
-        self, node: ResourceNode, temp_uri: str, viking_fs, has_key_frames: bool
+        self, node: ResourceNode, description: str, viking_fs, has_key_frames: bool
     ):
         """
         Phase 2: Generate abstract and overview.
 
         Args:
             node: ResourceNode to update
-            temp_uri: Temporary URI
+            description: Video description
             viking_fs: VikingFS instance
             has_key_frames: Whether key frames directory exists
         """
-        # Read description.md
-        description = await viking_fs.read_file(f"{temp_uri}/description.md")
-
         # Generate abstract (short summary, < 100 tokens)
         abstract = description[:200] if len(description) > 200 else description
 
@@ -227,7 +221,6 @@ async def _generate_semantic_info(
             description,
             "\n\n## Available Files\n",
             f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
-            "- description.md: Detailed video description\n",
         ]
 
         if has_key_frames:
@@ -241,13 +234,6 @@ async def _generate_semantic_info(
         overview_parts.append("# Purpose: Play or save the video\n")
         overview_parts.append("```\n\n")
 
-        overview_parts.append("### Get Video Description\n")
-        overview_parts.append("```python\n")
-        overview_parts.append("description = await video_resource.description()\n")
-        overview_parts.append("# Returns: FileContent object for further processing\n")
-        overview_parts.append("# Purpose: Understand video content\n")
-        overview_parts.append("```\n\n")
-
         if has_key_frames:
             overview_parts.append("### Get Key Frames\n")
             overview_parts.append("```python\n")
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py
index 39526602..a5cb6903 100644
--- a/openviking_cli/client/http.py
+++ b/openviking_cli/client/http.py
@@ -36,6 +36,7 @@
     load_json_config,
     resolve_config_path,
 )
+from openviking_cli.utils.uri import VikingURI
 
 # Error code to exception class mapping
 ERROR_CODE_TO_EXCEPTION = {
@@ -281,6 +282,7 @@ async def ls(
         node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/ls",
             params={
@@ -304,6 +306,7 @@ async def tree(
         node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/tree",
             params={
@@ -318,6 +321,7 @@ async def tree(
 
     async def stat(self, uri: str) -> Dict[str, Any]:
         """Get resource status."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/stat",
             params={"uri": uri},
@@ -326,6 +330,7 @@ async def stat(self, uri: str) -> Dict[str, Any]:
 
     async def mkdir(self, uri: str) -> None:
         """Create directory."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/fs/mkdir",
             json={"uri": uri},
@@ -334,6 +339,7 @@ async def mkdir(self, uri: str) -> None:
 
     async def rm(self, uri: str, recursive: bool = False) -> None:
         """Remove resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/fs",
@@ -343,6 +349,8 @@ async def rm(self, uri: str, recursive: bool = False) -> None:
 
     async def mv(self, from_uri: str, to_uri: str) -> None:
         """Move resource."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.post(
             "/api/v1/fs/mv",
             json={"from_uri": from_uri, "to_uri": to_uri},
@@ -353,6 +361,7 @@ async def mv(self, from_uri: str, to_uri: str) -> None:
 
     async def read(self, uri: str) -> str:
         """Read file content."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/read",
             params={"uri": uri},
@@ -361,6 +370,7 @@ async def read(self, uri: str) -> str:
 
     async def abstract(self, uri: str) -> str:
         """Read L0 abstract."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/abstract",
             params={"uri": uri},
@@ -369,6 +379,7 @@ async def abstract(self, uri: str) -> str:
 
     async def overview(self, uri: str) -> str:
         """Read L1 overview."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/overview",
             params={"uri": uri},
@@ -386,6 +397,8 @@ async def find(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search without session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         response = await self._http.post(
             "/api/v1/search/find",
             json={
@@ -409,6 +422,8 @@ async def search(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search with optional session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         sid = session_id or (session.session_id if session else None)
         response = await self._http.post(
             "/api/v1/search/search",
@@ -425,6 +440,7 @@ async def search(
 
     async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) -> Dict[str, Any]:
         """Content search with pattern."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/grep",
             json={
@@ -437,6 +453,7 @@ async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) ->
 
     async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
         """File pattern matching."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/glob",
             json={"pattern": pattern, "uri": uri},
@@ -447,6 +464,7 @@ async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
 
     async def relations(self, uri: str) -> List[Any]:
         """Get relations for a resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/relations",
             params={"uri": uri},
@@ -455,6 +473,11 @@ async def relations(self, uri: str) -> List[Any]:
 
     async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str = "") -> None:
         """Create link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        if isinstance(to_uris, str):
+            to_uris = VikingURI.normalize(to_uris)
+        else:
+            to_uris = [VikingURI.normalize(u) for u in to_uris]
         response = await self._http.post(
             "/api/v1/relations/link",
             json={"from_uri": from_uri, "to_uris": to_uris, "reason": reason},
@@ -463,6 +486,8 @@ async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str
 
     async def unlink(self, from_uri: str, to_uri: str) -> None:
         """Remove link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/relations/link",
@@ -512,6 +537,7 @@ async def add_message(self, session_id: str, role: str, content: str) -> Dict[st
 
     async def export_ovpack(self, uri: str, to: str) -> str:
         """Export context as .ovpack file."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/pack/export",
             json={"uri": uri, "to": to},
@@ -527,6 +553,7 @@ async def import_ovpack(
         vectorize: bool = True,
     ) -> str:
         """Import .ovpack file."""
+        parent = VikingURI.normalize(parent)
         response = await self._http.post(
             "/api/v1/pack/import",
             json={
diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py
index efa8744f..6f50a3a9 100644
--- a/openviking_cli/utils/uri.py
+++ b/openviking_cli/utils/uri.py
@@ -215,7 +215,8 @@ def sanitize_segment(text: str) -> str:
         """
         Sanitize text for use in URI segment.
 
-        Preserves Chinese characters but replaces special characters.
+        Preserves CJK characters (Chinese, Japanese, Korean) and other common scripts
+        while replacing special characters.
 
         Args:
             text: Original text
@@ -223,8 +224,18 @@ def sanitize_segment(text: str) -> str:
         Returns:
             URI-safe string
         """
-        # Preserve Chinese characters, letters, numbers, underscores, hyphens
-        safe = re.sub(r"[^\w\u4e00-\u9fff\-]", "_", text)
+        # Preserve:
+        # - Word characters (\w is Unicode-aware for str patterns, not only [a-zA-Z0-9_]) and hyphens
+        # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja)
+        # - Hiragana and Katakana (Japanese)
+        # - Hangul Syllables (Korean)
+        # - CJK Unified Ideographs Extension A
+        # - CJK Unified Ideographs Extension B
+        safe = re.sub(
+            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]",
+            "_",
+            text,
+        )
         # Merge consecutive underscores
         safe = re.sub(r"_+", "_", safe)
         # Strip leading/trailing underscores and limit length
@@ -245,6 +256,33 @@ def __eq__(self, other) -> bool:
     def __hash__(self) -> int:
         return hash(self.uri)
 
+    @staticmethod
+    def normalize(uri: str) -> str:
+        """
+        Normalize URI by ensuring it has the viking:// scheme.
+
+        If the input already starts with viking://, it is returned unchanged.
+        Otherwise every leading "/" is stripped first (prepending viking:// directly
+        to "/path" would give the invalid viking:///path), and viking:// is then
+        prepended to what remains.
+
+        Examples:
+            "/resources/images" -> "viking://resources/images"
+            "resources/images" -> "viking://resources/images"
+            "viking://resources/images" -> "viking://resources/images"
+
+        Args:
+            uri: Input URI string
+
+        Returns:
+            Normalized URI with viking:// scheme
+        """
+        if uri.startswith(f"{VikingURI.SCHEME}://"):
+            return uri
+        # Strip leading slashes
+        uri = uri.lstrip("/")
+        return f"{VikingURI.SCHEME}://{uri}"
+
     @classmethod
     def create_temp_uri(cls) -> str:
         """Create temp directory URI like viking://temp/MMDDHHMM_XXXXXX"""

From 1118919d6d580963c4482bee497d969154c935a3 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 16:41:05 +0800
Subject: [PATCH 08/18] Update media parsers to use original filenames and
 folder names with extensions

---
 openviking/parse/parsers/README.md      |  5 +++--
 openviking/parse/parsers/media/audio.py | 14 ++++++++++----
 openviking/parse/parsers/media/image.py | 14 ++++++++++----
 openviking/parse/parsers/media/video.py | 14 ++++++++++----
 4 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md
index 7f452eb0..34eafd2e 100644
--- a/openviking/parse/parsers/README.md
+++ b/openviking/parse/parsers/README.md
@@ -155,14 +155,15 @@ L1: """
   * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件
 * 对于每个媒体子目录下，每次上传的文件放在当前日期（而非文件内部元信息时间）的子目录下，例如：
   * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件
-* 对于每个多媒体文件，默认创建一个文件夹，文件夹名称与文件名想同但默认不包含后缀，例如：
-  * 上传文件 `20240820_123456.jpg` 后，默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456` 用于存储该文件的相关内容
+* 对于每个多媒体文件，默认创建一个文件夹，文件夹名称与文件名想同但默认包含后缀，例如：
+  * 上传文件 `20240820_123456.jpg` 后，默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456_jpg` 用于存储该文件的相关内容
   * 该文件夹内默认包含一个 `.abstract.md` 文件，用于存储该文件的摘要信息
     * 例如：图片文件的摘要可能是图片的文件名、内容描述、画面风格等，正常不应超过 200 token
   * 该文件夹内默认包含一个 `.overview.md` 文件，用于存储该文件的概览内容，例如：
     * 图片文件的概览内容除了包含 `.abstract.md` 中的内容，还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等
     * 音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果，以及主要的章节对应的时间线等
     * 视频文件的概览内容可能包含视频的文件名、时长、使用场景等，对于较大的视频，未来会对视频进行切分，可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等，因此视频的处理逻辑预期较为复杂，可等待图片、音频实现后，参考文件夹或 zip 的递归处理形态进行处理。
+ * 该文件夹内需要放置原始文件，保持原始文件名，例如 `20240820_123456.jpg`，但如果文件名包含空格字符，需要将其替换为下划线 `_`，因为 OpenViking URI 不允许包含空格字符。
 
 
 ## 核心组件
diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
index cfb7eaab..f0a018c3 100644
--- a/openviking/parse/parsers/media/audio.py
+++ b/openviking/parse/parsers/media/audio.py
@@ -95,12 +95,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         from openviking_cli.utils.uri import VikingURI
 
-        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
+        # Sanitize original filename (replace spaces with underscores)
+        original_filename = file_path.name.replace(" ", "_")
+        # Root directory name: filename stem + _ + extension (without dot)
+        stem = file_path.stem.replace(" ", "_")
+        ext_no_dot = ext[1:] if ext else ""
+        root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
-        # 1.1 Save original audio
-        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", audio_bytes)
+        # 1.1 Save original audio with original filename (sanitized)
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", audio_bytes)
 
         # 1.2 Validate audio file using magic bytes
         # Define magic bytes for supported audio formats
@@ -167,6 +172,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
                 "content_type": "audio",
                 "source_title": file_path.stem,
                 "semantic_name": file_path.stem,
+                "original_filename": original_filename,
             },
         )
 
@@ -239,7 +245,7 @@ async def _generate_semantic_info(
             "## Content Summary\n",
             description,
             "\n\n## Available Files\n",
-            f"- content.{node.meta['format']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n",
+            f"- {node.meta['original_filename']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n",
         ]
 
         if has_transcript:
diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py
index 544ba80c..c82b9589 100644
--- a/openviking/parse/parsers/media/image.py
+++ b/openviking/parse/parsers/media/image.py
@@ -107,12 +107,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         from openviking_cli.utils.uri import VikingURI
 
-        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
+        # Sanitize original filename (replace spaces with underscores)
+        original_filename = file_path.name.replace(" ", "_")
+        # Root directory name: filename stem + _ + extension (without dot)
+        stem = file_path.stem.replace(" ", "_")
+        ext_no_dot = ext[1:] if ext else ""
+        root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
-        # 1.1 Save original image
-        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", image_bytes)
+        # 1.1 Save original image with original filename (sanitized)
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", image_bytes)
 
         # 1.2 Validate and extract image metadata
         try:
@@ -155,6 +160,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
                 "content_type": "image",
                 "source_title": file_path.stem,
                 "semantic_name": file_path.stem,
+                "original_filename": original_filename,
             },
         )
 
@@ -223,7 +229,7 @@ async def _generate_semantic_info(
             "## Content Summary\n",
             description,
             "\n\n## Available Files\n",
-            f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n",
+            f"- {node.meta['original_filename']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n",
         ]
 
         if has_ocr:
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
index ea274e2c..53cccf67 100644
--- a/openviking/parse/parsers/media/video.py
+++ b/openviking/parse/parsers/media/video.py
@@ -96,12 +96,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         from openviking_cli.utils.uri import VikingURI
 
-        root_dir_name = VikingURI.sanitize_segment(file_path.stem)
+        # Sanitize original filename (replace spaces with underscores)
+        original_filename = file_path.name.replace(" ", "_")
+        # Root directory name: filename stem + _ + extension (without dot)
+        stem = file_path.stem.replace(" ", "_")
+        ext_no_dot = ext[1:] if ext else ""
+        root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
         root_dir_uri = f"{temp_uri}/{root_dir_name}"
         await viking_fs.mkdir(root_dir_uri)
 
-        # 1.1 Save original video
-        await viking_fs.write_file_bytes(f"{root_dir_uri}/content{ext}", video_bytes)
+        # 1.1 Save original video with original filename (sanitized)
+        await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", video_bytes)
 
         # 1.2 Validate video file using magic bytes
         # Define magic bytes for supported video formats
@@ -168,6 +173,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
                 "content_type": "video",
                 "source_title": file_path.stem,
                 "semantic_name": file_path.stem,
+                "original_filename": original_filename,
             },
         )
 
@@ -220,7 +226,7 @@ async def _generate_semantic_info(
             "## Content Summary\n",
             description,
             "\n\n## Available Files\n",
-            f"- content.{node.meta['format']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
+            f"- {node.meta['original_filename']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
         ]
 
         if has_key_frames:

From b1fc5915e816eeaa051293a6f9f64bb01f503dde Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Mon, 16 Feb 2026 17:05:25 +0800
Subject: [PATCH 09/18] Optimize MediaParser section for readability

---
 openviking/parse/parsers/README.md | 72 +++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md
index 34eafd2e..94fcc23c 100644
--- a/openviking/parse/parsers/README.md
+++ b/openviking/parse/parsers/README.md
@@ -144,26 +144,64 @@ L1: """
 代码解析器，支持语法高亮和代码结构分析。能识别函数、类、方法等代码元素。
 
 ### 6. MediaParser (`media.py`)
-**支持格式**: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.mp4`, `.mov`, `.avi`, `.webm`, `.mp3`, `.wav`, `.m4a`, `.flac`
+
+**支持格式**:
+- 图片: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`
+- 视频: `.mp4`, `.mov`, `.avi`, `.webm`
+- 音频: `.mp3`, `.wav`, `.m4a`, `.flac`
 
 多媒体解析器，使用 VLM（视觉语言模型）分析图像、视频和音频内容，生成文本描述。
 
-对于添加多媒体文件的存储组织方式，我们采用以下策略：
-* 在 viking://resource 下创建 images, audio, video 三个媒体子目录，分别是：
-  * viking://resource/images 用于存储提交时未明确指定目标路径的图片文件
-  * viking://resource/audio 用于存储提交时未明确指定目标路径的音频文件
-  * viking://resource/video 用于存储提交时未明确指定目标路径的视频文件
-* 对于每个媒体子目录下，每次上传的文件放在当前日期（而非文件内部元信息时间）的子目录下，例如：
-  * viking://resource/images/20240820/ 内存储 20240820 上传的所有图片文件
-* 对于每个多媒体文件，默认创建一个文件夹，文件夹名称与文件名想同但默认包含后缀，例如：
-  * 上传文件 `20240820_123456.jpg` 后，默认在 `viking://resource/images/{this_date}/` 下创建文件夹 `20240820_123456_jpg` 用于存储该文件的相关内容
-  * 该文件夹内默认包含一个 `.abstract.md` 文件，用于存储该文件的摘要信息
-    * 例如：图片文件的摘要可能是图片的文件名、内容描述、画面风格等，正常不应超过 200 token
-  * 该文件夹内默认包含一个 `.overview.md` 文件，用于存储该文件的概览内容，例如：
-    * 图片文件的概览内容除了包含 `.abstract.md` 中的内容，还可能包含图片的尺寸、画面风格、OCR 识别结果、场景和主体描述等
-    * 音频文件的概览内容可能包含音频的文件名、时长、语音或歌词识别结果，以及主要的章节对应的时间线等
-    * 视频文件的概览内容可能包含视频的文件名、时长、使用场景等，对于较大的视频，未来会对视频进行切分，可能会继续递归用子文件夹存储切分后的视频文件、音轨文件、关键画面的截图等，因此视频的处理逻辑预期较为复杂，可等待图片、音频实现后，参考文件夹或 zip 的递归处理形态进行处理。
- * 该文件夹内需要放置原始文件，保持原始文件名，例如 `20240820_123456.jpg`，但如果文件名包含空格字符，需要将其替换为下划线 `_`，因为 OpenViking URI 不允许包含空格字符。
+#### 存储组织策略
+
+多媒体文件的存储采用以下层级结构：
+
+```
+viking://resource/
+├── images/     # 图片文件
+│   └── 20240820/  # 上传日期（YYYYMMDD）
+│       └── 20240820_123456_jpg/  # 文件文件夹（文件名_扩展名）
+│           ├── .abstract.md    # L0 摘要
+│           ├── .overview.md    # L1 概览
+│           └── 20240820_123456.jpg  # 原始文件
+├── audio/      # 音频文件
+│   └── 20240820/
+│       └── my_song_mp3/
+│           ├── .abstract.md
+│           ├── .overview.md
+│           └── my_song.mp3
+└── video/      # 视频文件
+    └── 20240820/
+        └── my_video_mp4/
+            ├── .abstract.md
+            ├── .overview.md
+            └── my_video.mp4
+```
+
+详细说明：
+
+1. **媒体子目录**: 在 `viking://resource` 下按类型划分为三个子目录
+   - `viking://resource/images`: 存储未明确指定目标路径的图片文件
+   - `viking://resource/audio`: 存储未明确指定目标路径的音频文件
+   - `viking://resource/video`: 存储未明确指定目标路径的视频文件
+
+2. **日期子目录**: 每次上传的文件按当前日期（格式：YYYYMMDD）组织，而非文件内部元信息的时间
+   - 例如：`viking://resource/images/20240820/` 存储 2024年8月20日上传的所有图片
+
+3. **文件文件夹**: 为每个多媒体文件创建一个专属文件夹，命名规则为：`文件名_扩展名`（扩展名不含点）
+   - 示例：上传 `20240820_123456.jpg` → 创建文件夹 `20240820_123456_jpg`
+
+4. **文件夹内容**: 每个文件文件夹内包含：
+   - **原始文件**: 保持原始文件名，空格字符替换为下划线 `_`（因为 OpenViking URI 不允许包含空格）
+     - 示例：`photo 1.jpg` → 保存为 `photo_1.jpg`
+   - `.abstract.md` (L0 层): 摘要信息（<200 token）
+     - 图片：文件名、内容描述、画面风格等
+     - 音频：文件名、时长、内容概述等
+     - 视频：文件名、时长、内容概述等
+   - `.overview.md` (L1 层): 概览信息
+     - 图片：除摘要外，还包含尺寸、OCR 识别结果、场景和主体描述等
+     - 音频：除摘要外，还包含语音/歌词识别结果、章节时间线等
+     - 视频：除摘要外，还包含使用场景等（未来会支持切分视频，递归存储子文件）
 
 
 ## 核心组件

From 3e3483b9f3321c29846501627d326e27e166a16a Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 17 Feb 2026 23:31:43 +0800
Subject: [PATCH 10/18] feat: vlm optimization for image

---
 examples/chatmem/ov.conf.example              |  3 +-
 examples/mcp-query/ov.conf.example            |  3 +-
 examples/memex/ov.conf.example                |  3 +-
 examples/ov.conf.example                      |  3 +-
 examples/query/ov.conf.example                |  3 +-
 examples/server_client/ov.conf.example        |  3 +-
 openviking/models/vlm/backends/litellm_vlm.py | 26 +++++++++++++--
 openviking/models/vlm/backends/openai_vlm.py  | 32 +++++++++++++------
 .../models/vlm/backends/volcengine_vlm.py     | 16 ++++++----
 openviking/models/vlm/base.py                 | 12 +++++--
 openviking/models/vlm/llm.py                  | 22 +++++++++----
 openviking_cli/utils/config/vlm_config.py     | 28 +++++++++-------
 12 files changed, 109 insertions(+), 45 deletions(-)

diff --git a/examples/chatmem/ov.conf.example b/examples/chatmem/ov.conf.example
index 2e9a40ae..6a085e5d 100644
--- a/examples/chatmem/ov.conf.example
+++ b/examples/chatmem/ov.conf.example
@@ -12,6 +12,7 @@
     "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3",
     "api_key"  : "not_gonna_give_u_this",
     "backend"  : "volcengine",
-    "model"    : "doubao-seed-1-8-251228"
+    "model"    : "doubao-seed-1-8-251228",
+    "thinking": false
   }
 }
diff --git a/examples/mcp-query/ov.conf.example b/examples/mcp-query/ov.conf.example
index bf4a45de..fc40ea92 100644
--- a/examples/mcp-query/ov.conf.example
+++ b/examples/mcp-query/ov.conf.example
@@ -12,6 +12,7 @@
     "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3",
     "api_key"  : "<your-api-key>",
     "provider" : "volcengine",
-    "model"    : "doubao-seed-1-8-251228"
+    "model"    : "doubao-seed-1-8-251228",
+    "thinking": false
   }
 }
diff --git a/examples/memex/ov.conf.example b/examples/memex/ov.conf.example
index d5187cd3..42697b72 100644
--- a/examples/memex/ov.conf.example
+++ b/examples/memex/ov.conf.example
@@ -12,6 +12,7 @@
     "api_base" : "https://ark.cn-beijing.volces.com/api/v3",
     "api_key"  : "your-volcengine-api-key",
     "backend"  : "volcengine",
-    "model"    : "doubao-seed-1-8-251228"
+    "model"    : "doubao-seed-1-8-251228",
+    "thinking": false
   }
 }
diff --git a/examples/ov.conf.example b/examples/ov.conf.example
index 205cd7d9..34cbc6a4 100644
--- a/examples/ov.conf.example
+++ b/examples/ov.conf.example
@@ -45,7 +45,8 @@
     "api_base": "https://ark.cn-beijing.volces.com/api/v3",
     "temperature": 0.0,
     "max_retries": 2,
-    "provider": "volcengine"
+    "provider": "volcengine",
+    "thinking": false
   },
   "rerank": {
     "ak": null,
diff --git a/examples/query/ov.conf.example b/examples/query/ov.conf.example
index 58d034c0..fdc7cb55 100644
--- a/examples/query/ov.conf.example
+++ b/examples/query/ov.conf.example
@@ -12,6 +12,7 @@
     "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3",
     "api_key"  : "not_gonna_give_u_this",
     "provider" : "volcengine",
-    "model"    : "doubao-seed-1-8-251228"
+    "model"    : "doubao-seed-1-8-251228",
+    "thinking": false
   }
 }
diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example
index 582d79b8..13eb55db 100644
--- a/examples/server_client/ov.conf.example
+++ b/examples/server_client/ov.conf.example
@@ -34,6 +34,7 @@
     "api_base": "https://ark.cn-beijing.volces.com/api/v3",
     "temperature": 0.0,
     "max_retries": 2,
-    "provider": "volcengine"
+    "provider": "volcengine",
+    "thinking": false
   }
 }
diff --git a/openviking/models/vlm/backends/litellm_vlm.py b/openviking/models/vlm/backends/litellm_vlm.py
index f1efa562..2373e5dd 100644
--- a/openviking/models/vlm/backends/litellm_vlm.py
+++ b/openviking/models/vlm/backends/litellm_vlm.py
@@ -139,21 +139,31 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]:
 
         return kwargs
 
-    def get_completion(self, prompt: str) -> str:
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
         """Get text completion synchronously."""
         model = self._resolve_model(self.model or "gpt-4o-mini")
         messages = [{"role": "user", "content": prompt}]
+        original_thinking = self._thinking
+        if thinking:
+            self._thinking = thinking
         kwargs = self._build_kwargs(model, messages)
+        self._thinking = original_thinking
 
         response = completion(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
 
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
         """Get text completion asynchronously."""
         model = self._resolve_model(self.model or "gpt-4o-mini")
         messages = [{"role": "user", "content": prompt}]
+        original_thinking = self._thinking
+        if thinking:
+            self._thinking = thinking
         kwargs = self._build_kwargs(model, messages)
+        self._thinking = original_thinking
 
         last_error = None
         for attempt in range(max_retries + 1):
@@ -164,7 +174,7 @@ async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
             except Exception as e:
                 last_error = e
                 if attempt < max_retries:
-                    await asyncio.sleep(2 ** attempt)
+                    await asyncio.sleep(2**attempt)
 
         if last_error:
             raise last_error
@@ -174,6 +184,7 @@ def get_vision_completion(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion synchronously."""
         model = self._resolve_model(self.model or "gpt-4o-mini")
@@ -184,7 +195,11 @@ def get_vision_completion(
         content.append({"type": "text", "text": prompt})
 
         messages = [{"role": "user", "content": content}]
+        original_thinking = self._thinking
+        if thinking:
+            self._thinking = thinking
         kwargs = self._build_kwargs(model, messages)
+        self._thinking = original_thinking
 
         response = completion(**kwargs)
         self._update_token_usage_from_response(response)
@@ -194,6 +209,7 @@ async def get_vision_completion_async(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion asynchronously."""
         model = self._resolve_model(self.model or "gpt-4o-mini")
@@ -204,7 +220,11 @@ async def get_vision_completion_async(
         content.append({"type": "text", "text": prompt})
 
         messages = [{"role": "user", "content": content}]
+        original_thinking = self._thinking
+        if thinking:
+            self._thinking = thinking
         kwargs = self._build_kwargs(model, messages)
+        self._thinking = original_thinking
 
         response = await acompletion(**kwargs)
         self._update_token_usage_from_response(response)
diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py
index c6c5b230..18a22ff7 100644
--- a/openviking/models/vlm/backends/openai_vlm.py
+++ b/openviking/models/vlm/backends/openai_vlm.py
@@ -131,6 +131,7 @@ def get_vision_completion(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion"""
         client = self.get_client()
@@ -140,11 +141,16 @@ def get_vision_completion(
             content.append(self._prepare_image(img))
         content.append({"type": "text", "text": prompt})
 
-        response = client.chat.completions.create(
-            model=self.model or "gpt-4o-mini",
-            messages=[{"role": "user", "content": content}],
-            temperature=self.temperature,
-        )
+        kwargs = {
+            "model": self.model or "gpt-4o-mini",
+            "messages": [{"role": "user", "content": content}],
+            "temperature": self.temperature,
+        }
+
+        if self.provider == "volcengine":
+            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
+
+        response = client.chat.completions.create(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
 
@@ -152,6 +158,7 @@ async def get_vision_completion_async(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion asynchronously"""
         client = self.get_async_client()
@@ -161,10 +168,15 @@ async def get_vision_completion_async(
             content.append(self._prepare_image(img))
         content.append({"type": "text", "text": prompt})
 
-        response = await client.chat.completions.create(
-            model=self.model or "gpt-4o-mini",
-            messages=[{"role": "user", "content": content}],
-            temperature=self.temperature,
-        )
+        kwargs = {
+            "model": self.model or "gpt-4o-mini",
+            "messages": [{"role": "user", "content": content}],
+            "temperature": self.temperature,
+        }
+
+        if self.provider == "volcengine":
+            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
+
+        response = await client.chat.completions.create(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py
index f11a289e..b5841cc8 100644
--- a/openviking/models/vlm/backends/volcengine_vlm.py
+++ b/openviking/models/vlm/backends/volcengine_vlm.py
@@ -54,22 +54,26 @@ def get_async_client(self):
             )
         return self._async_client
 
-    def get_completion(self, prompt: str) -> str:
-        return super().get_completion(prompt)
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
+        return super().get_completion(prompt, thinking)
 
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
-        return await super().get_completion_async(prompt, max_retries)
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
+        return await super().get_completion_async(prompt, thinking, max_retries)
 
     def get_vision_completion(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
-        return super().get_vision_completion(prompt, images)
+        return super().get_vision_completion(prompt, images, thinking)
 
     async def get_vision_completion_async(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
-        return await super().get_vision_completion_async(prompt, images)
+        return await super().get_vision_completion_async(prompt, images, thinking)
diff --git a/openviking/models/vlm/base.py b/openviking/models/vlm/base.py
index ef55f712..cd563f9c 100644
--- a/openviking/models/vlm/base.py
+++ b/openviking/models/vlm/base.py
@@ -27,12 +27,14 @@ def __init__(self, config: Dict[str, Any]):
         self._token_tracker = TokenUsageTracker()
 
     @abstractmethod
-    def get_completion(self, prompt: str) -> str:
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
         """Get text completion"""
         pass
 
     @abstractmethod
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
         """Get text completion asynchronously"""
         pass
 
@@ -41,6 +43,7 @@ def get_vision_completion(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion"""
         pass
@@ -50,6 +53,7 @@ async def get_vision_completion_async(
         self,
         prompt: str,
         images: List[Union[str, Path, bytes]],
+        thinking: bool = False,
     ) -> str:
         """Get vision completion asynchronously"""
         pass
@@ -128,16 +132,20 @@ def create(config: Dict[str, Any]) -> VLMBase:
         if not use_litellm:
             if provider == "openai":
                 from .backends.openai_vlm import OpenAIVLM
+
                 return OpenAIVLM(config)
             elif provider == "volcengine":
                 from .backends.volcengine_vlm import VolcEngineVLM
+
                 return VolcEngineVLM(config)
 
         from .backends.litellm_vlm import LiteLLMVLMProvider
+
         return LiteLLMVLMProvider(config)
 
     @staticmethod
     def get_available_providers() -> List[str]:
         """Get list of available providers"""
         from .registry import get_all_provider_names
+
         return get_all_provider_names()
diff --git a/openviking/models/vlm/llm.py b/openviking/models/vlm/llm.py
index 07c52179..6c8b9c56 100644
--- a/openviking/models/vlm/llm.py
+++ b/openviking/models/vlm/llm.py
@@ -168,34 +168,38 @@ def complete_json(
         self,
         prompt: str,
         schema: Optional[Dict[str, Any]] = None,
+        thinking: bool = False,
     ) -> Optional[Dict[str, Any]]:
         """Get JSON completion from VLM."""
         if schema:
             prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}"
 
-        response = self._get_vlm().get_completion(prompt)
+        response = self._get_vlm().get_completion(prompt, thinking)
         return parse_json_from_response(response)
 
     async def complete_json_async(
         self,
         prompt: str,
         schema: Optional[Dict[str, Any]] = None,
+        thinking: bool = False,
+        max_retries: int = 0,
     ) -> Optional[Dict[str, Any]]:
         """Async version of complete_json."""
         if schema:
             prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}"
 
-        response = await self._get_vlm().get_completion_async(prompt)
+        response = await self._get_vlm().get_completion_async(prompt, thinking, max_retries)
         return parse_json_from_response(response)
 
     def complete_model(
         self,
         prompt: str,
         model_class: Type[T],
+        thinking: bool = False,
     ) -> Optional[T]:
         """Get structured completion validated against a Pydantic model."""
         schema = model_class.model_json_schema()
-        response = self.complete_json(prompt, schema=schema)
+        response = self.complete_json(prompt, schema=schema, thinking=thinking)
         if response is None:
             return None
 
@@ -209,10 +213,14 @@ async def complete_model_async(
         self,
         prompt: str,
         model_class: Type[T],
+        thinking: bool = False,
+        max_retries: int = 0,
     ) -> Optional[T]:
         """Async version of complete_model."""
         schema = model_class.model_json_schema()
-        response = await self.complete_json_async(prompt, schema=schema)
+        response = await self.complete_json_async(
+            prompt, schema=schema, thinking=thinking, max_retries=max_retries
+        )
         if response is None:
             return None
 
@@ -226,14 +234,16 @@ def get_vision_completion(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get vision completion."""
-        return self._get_vlm().get_vision_completion(prompt, images)
+        return self._get_vlm().get_vision_completion(prompt, images, thinking)
 
     async def get_vision_completion_async(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Async vision completion."""
-        return await self._get_vlm().get_vision_completion_async(prompt, images)
+        return await self._get_vlm().get_vision_completion_async(prompt, images, thinking)
diff --git a/openviking_cli/utils/config/vlm_config.py b/openviking_cli/utils/config/vlm_config.py
index ad1bea8f..411c7d76 100644
--- a/openviking_cli/utils/config/vlm_config.py
+++ b/openviking_cli/utils/config/vlm_config.py
@@ -15,17 +15,16 @@ class VLMConfig(BaseModel):
     max_retries: int = Field(default=2, description="Maximum retry attempts")
 
     provider: Optional[str] = Field(default=None, description="Provider type")
-    backend: Optional[str] = Field(default=None, description="Backend provider (Deprecated, use 'provider' instead)")
+    backend: Optional[str] = Field(
+        default=None, description="Backend provider (Deprecated, use 'provider' instead)"
+    )
 
     providers: Dict[str, Dict[str, Any]] = Field(
         default_factory=dict,
-        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}"
+        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}",
     )
 
-    default_provider: Optional[str] = Field(
-        default=None,
-        description="Default provider name"
-    )
+    default_provider: Optional[str] = Field(default=None, description="Default provider name")
 
     thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models")
 
@@ -141,6 +140,7 @@ def get_vlm_instance(self) -> Any:
         if self._vlm_instance is None:
             config_dict = self._build_vlm_config_dict()
             from openviking.models.vlm import VLMFactory
+
             self._vlm_instance = VLMFactory.create(config_dict)
         return self._vlm_instance
 
@@ -166,13 +166,15 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]:
 
         return result
 
-    def get_completion(self, prompt: str) -> str:
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
         """Get LLM completion."""
-        return self.get_vlm_instance().get_completion(prompt)
+        return self.get_vlm_instance().get_completion(prompt, thinking)
 
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
         """Get LLM completion asynchronously, max_retries=0 means no retry."""
-        return await self.get_vlm_instance().get_completion_async(prompt, max_retries)
+        return await self.get_vlm_instance().get_completion_async(prompt, thinking, max_retries)
 
     def is_available(self) -> bool:
         """Check if LLM is configured."""
@@ -182,14 +184,16 @@ def get_vision_completion(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images."""
-        return self.get_vlm_instance().get_vision_completion(prompt, images)
+        return self.get_vlm_instance().get_vision_completion(prompt, images, thinking)
 
     async def get_vision_completion_async(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images asynchronously."""
-        return await self.get_vlm_instance().get_vision_completion_async(prompt, images)
+        return await self.get_vlm_instance().get_vision_completion_async(prompt, images, thinking)

From 5b818f5d888ec2df9bd77479c5b930055c08a4f6 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Thu, 19 Feb 2026 17:07:25 +0800
Subject: [PATCH 11/18] feat: vlm optimization for image

---
 openviking/models/vlm/backends/openai_vlm.py  |  12 --
 .../models/vlm/backends/volcengine_vlm.py     | 107 +++++++++++++++++-
 openviking/parse/directory_scan.py            |   1 +
 openviking/parse/parsers/README.md            |   6 +-
 openviking/parse/parsers/directory.py         |  67 +++++++++--
 openviking/parse/parsers/media/audio.py       |   3 +-
 openviking/parse/parsers/media/constants.py   |  15 +++
 openviking/parse/parsers/media/image.py       |  54 +++++++--
 openviking/parse/parsers/media/video.py       |   3 +-
 openviking/parse/registry.py                  |  67 +++++------
 .../templates/parsing/image_summary.yaml      |  31 +++++
 .../storage/queuefs/semantic_processor.py     |  20 +++-
 openviking_cli/utils/config/parser_config.py  |   2 +-
 tests/parse/test_add_directory.py             | 101 +++++++++++++++++
 14 files changed, 408 insertions(+), 81 deletions(-)
 create mode 100644 openviking/parse/parsers/media/constants.py
 create mode 100644 openviking/prompts/templates/parsing/image_summary.yaml

diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py
index 18a22ff7..d6f6effa 100644
--- a/openviking/models/vlm/backends/openai_vlm.py
+++ b/openviking/models/vlm/backends/openai_vlm.py
@@ -61,9 +61,6 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
             "temperature": self.temperature,
         }
 
-        if self.provider == "volcengine":
-            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
-
         response = client.chat.completions.create(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
@@ -79,9 +76,6 @@ async def get_completion_async(
             "temperature": self.temperature,
         }
 
-        if self.provider == "volcengine":
-            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
-
         last_error = None
         for attempt in range(max_retries + 1):
             try:
@@ -147,9 +141,6 @@ def get_vision_completion(
             "temperature": self.temperature,
         }
 
-        if self.provider == "volcengine":
-            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
-
         response = client.chat.completions.create(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
@@ -174,9 +165,6 @@ async def get_vision_completion_async(
             "temperature": self.temperature,
         }
 
-        if self.provider == "volcengine":
-            kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"}
-
         response = await client.chat.completions.create(**kwargs)
         self._update_token_usage_from_response(response)
         return response.choices[0].message.content or ""
diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py
index b5841cc8..e4c4d290 100644
--- a/openviking/models/vlm/backends/volcengine_vlm.py
+++ b/openviking/models/vlm/backends/volcengine_vlm.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """VolcEngine VLM backend implementation"""
 
+import asyncio
+import base64
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
@@ -55,12 +57,75 @@ def get_async_client(self):
         return self._async_client
 
     def get_completion(self, prompt: str, thinking: bool = False) -> str:
-        return super().get_completion(prompt, thinking)
+        """Get text completion"""
+        client = self.get_client()
+        kwargs = {
+            "model": self.model or "doubao-seed-1-8-251228",
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+
+        response = client.chat.completions.create(**kwargs)
+        self._update_token_usage_from_response(response)
+        return response.choices[0].message.content or ""
 
     async def get_completion_async(
         self, prompt: str, thinking: bool = False, max_retries: int = 0
     ) -> str:
-        return await super().get_completion_async(prompt, thinking, max_retries)
+        """Get text completion asynchronously"""
+        client = self.get_async_client()
+        kwargs = {
+            "model": self.model or "doubao-seed-1-8-251228",
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+
+        last_error = None
+        for attempt in range(max_retries + 1):
+            try:
+                response = await client.chat.completions.create(**kwargs)
+                self._update_token_usage_from_response(response)
+                return response.choices[0].message.content or ""
+            except Exception as e:
+                last_error = e
+                if attempt < max_retries:
+                    await asyncio.sleep(2**attempt)
+
+        if last_error:
+            raise last_error
+        else:
+            raise RuntimeError("Unknown error in async completion")
+
+    def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]:
+        """Prepare image data"""
+        if isinstance(image, bytes):
+            b64 = base64.b64encode(image).decode("utf-8")
+            return {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64}"},
+            }
+        elif isinstance(image, Path) or (
+            isinstance(image, str) and not image.startswith(("http://", "https://"))
+        ):
+            path = Path(image)
+            suffix = path.suffix.lower()
+            mime_type = {
+                ".png": "image/png",
+                ".jpg": "image/jpeg",
+                ".jpeg": "image/jpeg",
+                ".gif": "image/gif",
+                ".webp": "image/webp",
+            }.get(suffix, "image/png")
+            with open(path, "rb") as f:
+                b64 = base64.b64encode(f.read()).decode("utf-8")
+            return {
+                "type": "image_url",
+                "image_url": {"url": f"data:{mime_type};base64,{b64}"},
+            }
+        else:
+            return {"type": "image_url", "image_url": {"url": image}}
 
     def get_vision_completion(
         self,
@@ -68,7 +133,24 @@ def get_vision_completion(
         images: List[Union[str, Path, bytes]],
         thinking: bool = False,
     ) -> str:
-        return super().get_vision_completion(prompt, images, thinking)
+        """Get vision completion"""
+        client = self.get_client()
+
+        content = []
+        for img in images:
+            content.append(self._prepare_image(img))
+        content.append({"type": "text", "text": prompt})
+
+        kwargs = {
+            "model": self.model or "doubao-seed-1-8-251228",
+            "messages": [{"role": "user", "content": content}],
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+
+        response = client.chat.completions.create(**kwargs)
+        self._update_token_usage_from_response(response)
+        return response.choices[0].message.content or ""
 
     async def get_vision_completion_async(
         self,
@@ -76,4 +158,21 @@ async def get_vision_completion_async(
         images: List[Union[str, Path, bytes]],
         thinking: bool = False,
     ) -> str:
-        return await super().get_vision_completion_async(prompt, images, thinking)
+        """Get vision completion asynchronously"""
+        client = self.get_async_client()
+
+        content = []
+        for img in images:
+            content.append(self._prepare_image(img))
+        content.append({"type": "text", "text": prompt})
+
+        kwargs = {
+            "model": self.model or "doubao-seed-1-8-251228",
+            "messages": [{"role": "user", "content": content}],
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+
+        response = await client.chat.completions.create(**kwargs)
+        self._update_token_usage_from_response(response)
+        return response.choices[0].message.content or ""
diff --git a/openviking/parse/directory_scan.py b/openviking/parse/directory_scan.py
index 2b62b6c1..8da532f2 100644
--- a/openviking/parse/directory_scan.py
+++ b/openviking/parse/directory_scan.py
@@ -164,6 +164,7 @@ def _classify_file(
 
     Processable: ParserRegistry has a parser, or is_text_file (code/config/docs).
     """
+    # Normal classification logic
     if registry.get_parser_for_file(file_path) is not None:
         return CLASS_PROCESSABLE
     if is_text_file(file_path):
diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md
index 94fcc23c..ceffac29 100644
--- a/openviking/parse/parsers/README.md
+++ b/openviking/parse/parsers/README.md
@@ -143,14 +143,16 @@ L1: """
 
 代码解析器，支持语法高亮和代码结构分析。能识别函数、类、方法等代码元素。
 
-### 6. MediaParser (`media.py`)
+### 6. MediaParser (`media/*.py`)
 
 **支持格式**:
 - 图片: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`
 - 视频: `.mp4`, `.mov`, `.avi`, `.webm`
 - 音频: `.mp3`, `.wav`, `.m4a`, `.flac`
 
-多媒体解析器，使用 VLM（视觉语言模型）分析图像、视频和音频内容，生成文本描述。
+多媒体解析器，使用 VLM（视觉语言模型）分析图像、视频和音频内容，生成文本描述。多媒体解析器当且仅当 add-resource 调用时只添加上述文件类型时生效。即：
+1. 当添加目录时，系统将对多媒体文件暂不生成单独目录和文本描述，仅存储和进行递归摘要。
+2. 当单独添加多媒体文件时，多媒体解析器会直接解析该文件，然后通过单独目录存放，在目录下生成文本描述。
 
 #### 存储组织策略
 
diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py
index 5f2f05fa..6fc6d8a6 100644
--- a/openviking/parse/parsers/directory.py
+++ b/openviking/parse/parsers/directory.py
@@ -27,6 +27,7 @@
     create_parse_result,
 )
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.media.constants import MEDIA_EXTENSIONS
 from openviking_cli.utils.logger import get_logger
 
 if TYPE_CHECKING:
@@ -74,7 +75,8 @@ async def parse(
             source: Path to the directory.
             instruction: Processing instruction (forwarded where applicable).
             **kwargs: Extra options forwarded to ``scan_directory``:
-                ``strict``, ``ignore_dirs``, ``include``, ``exclude``.
+                ``strict``, ``ignore_dirs``, ``include``, ``exclude``,
+                ``directly_upload_media``.
 
         Returns:
             ``ParseResult`` with ``temp_dir_path`` pointing to VikingFS temp.
@@ -103,6 +105,7 @@ async def parse(
                 include=kwargs.get("include"),
                 exclude=kwargs.get("exclude"),
             )
+            directly_upload_media = kwargs.get("directly_upload_media", True)
             processable_files = scan_result.all_processable_files()
             warnings.extend(scan_result.warnings)
 
@@ -137,13 +140,35 @@ async def parse(
             for cf in processable_files:
                 file_parser = self._assign_parser(cf, registry)
                 parser_name = type(file_parser).__name__ if file_parser else "direct"
-                ok = await self._process_single_file(
-                    cf,
-                    file_parser,
-                    target_uri,
-                    viking_fs,
-                    warnings,
-                )
+
+                # Check if this is a media parser and we should directly upload
+                is_media_parser = file_parser and parser_name in [
+                    "ImageParser",
+                    "AudioParser",
+                    "VideoParser",
+                ]
+                ext = Path(cf.path).suffix.lower()
+                is_media_file = ext in MEDIA_EXTENSIONS
+
+                if directly_upload_media and is_media_parser and is_media_file:
+                    # Directly upload media file without using media parser
+                    ok = await self._upload_file_directly(
+                        cf,
+                        target_uri,
+                        viking_fs,
+                        warnings,
+                    )
+                    parser_name = "direct_upload"
+                else:
+                    # Normal processing with parser
+                    ok = await self._process_single_file(
+                        cf,
+                        file_parser,
+                        target_uri,
+                        viking_fs,
+                        warnings,
+                    )
+
                 if ok:
                     file_count += 1
                     processed_files.append(
@@ -332,6 +357,32 @@ async def _process_single_file(
                 warnings.append(f"Failed to upload {rel_path}: {exc}")
                 return False
 
+    @staticmethod
+    async def _upload_file_directly(
+        classified_file: "ClassifiedFile",
+        target_uri: str,
+        viking_fs: Any,
+        warnings: List[str],
+    ) -> bool:
+        """Directly upload a file without using its parser.
+
+        Used for media files when directly_upload_media=True.
+
+        Returns:
+            *True* on success, *False* on failure.
+        """
+        rel_path = classified_file.rel_path
+        src_file = classified_file.path
+
+        try:
+            content = src_file.read_bytes()
+            dst_uri = f"{target_uri}/{rel_path}"
+            await viking_fs.write_file(dst_uri, content)
+            return True
+        except Exception as exc:
+            warnings.append(f"Failed to upload {rel_path}: {exc}")
+            return False
+
     # ------------------------------------------------------------------
     # VikingFS merge helpers
     # ------------------------------------------------------------------
diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
index f0a018c3..e9473658 100644
--- a/openviking/parse/parsers/media/audio.py
+++ b/openviking/parse/parsers/media/audio.py
@@ -29,6 +29,7 @@
 
 from openviking.parse.base import NodeType, ParseResult, ResourceNode
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.media.constants import AUDIO_EXTENSIONS
 from openviking_cli.utils.config.parser_config import AudioConfig
 
 
@@ -50,7 +51,7 @@ def __init__(self, config: Optional[AudioConfig] = None, **kwargs):
     @property
     def supported_extensions(self) -> List[str]:
         """Return supported audio file extensions."""
-        return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+        return AUDIO_EXTENSIONS
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
diff --git a/openviking/parse/parsers/media/constants.py b/openviking/parse/parsers/media/constants.py
new file mode 100644
index 00000000..f8bc96cf
--- /dev/null
+++ b/openviking/parse/parsers/media/constants.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+"""Constants for media parsers."""
+
+# Image extensions supported by ImageParser
+IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
+
+# Audio extensions supported by AudioParser
+AUDIO_EXTENSIONS = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+
+# Video extensions supported by VideoParser
+VIDEO_EXTENSIONS = [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"]
+
+# All media extensions combined
+MEDIA_EXTENSIONS = set(IMAGE_EXTENSIONS + AUDIO_EXTENSIONS + VIDEO_EXTENSIONS)
diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py
index c82b9589..059735d1 100644
--- a/openviking/parse/parsers/media/image.py
+++ b/openviking/parse/parsers/media/image.py
@@ -17,7 +17,12 @@
 
 from openviking.parse.base import NodeType, ParseResult, ResourceNode
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.media.constants import IMAGE_EXTENSIONS
+from openviking.storage.viking_fs import get_viking_fs
+from openviking_cli.utils.config import get_openviking_config
 from openviking_cli.utils.config.parser_config import ImageConfig
+from openviking_cli.utils.logger import get_logger
+from openviking_cli.utils.uri import VikingURI
 
 # =============================================================================
 # Configuration Classes
@@ -62,7 +67,7 @@ def __init__(self, config: Optional[ImageConfig] = None, **kwargs):
     @property
     def supported_extensions(self) -> List[str]:
         """Return supported image file extensions."""
-        return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
+        return IMAGE_EXTENSIONS
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
@@ -91,7 +96,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             FileNotFoundError: If source file does not exist
             IOError: If image processing fails
         """
-        from openviking.storage.viking_fs import get_viking_fs
 
         # Convert to Path object
         file_path = Path(source) if isinstance(source, str) else source
@@ -105,8 +109,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         image_bytes = file_path.read_bytes()
         ext = file_path.suffix
 
-        from openviking_cli.utils.uri import VikingURI
-
         # Sanitize original filename (replace spaces with underscores)
         original_filename = file_path.name.replace(" ", "_")
         # Root directory name: filename stem + _ + extension (without dot)
@@ -165,7 +167,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         )
 
         # Phase 2: Generate semantic info
-        await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None)
+        await self._generate_semantic_info(
+            root_node, description, viking_fs, ocr_text is not None, root_dir_uri
+        )
 
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
@@ -187,11 +191,34 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str:
 
         Returns:
             Image description in markdown format
-
-        TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.)
         """
-        # Fallback implementation - returns basic placeholder
-        return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API."
+        from openviking.prompts import render_prompt
+
+        logger = get_logger(__name__)
+
+        try:
+            vlm = get_openviking_config().vlm
+
+            # Render prompt
+            prompt = render_prompt(
+                "parsing.image_summary",
+                {
+                    "context": "No additional context",
+                },
+            )
+
+            # Call VLM
+            response = await vlm.get_vision_completion_async(
+                prompt=prompt,
+                images=[image_bytes],
+            )
+
+            return response.strip()
+
+        except Exception as e:
+            logger.error(f"Error in VLM image description: {e}")
+            # Fallback to basic description
+            return "Image description (VLM integration failed)\n\nThis is an image file."
 
     async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]:
         """
@@ -210,16 +237,17 @@ async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]:
         return None
 
     async def _generate_semantic_info(
-        self, node: ResourceNode, description: str, viking_fs, has_ocr: bool
+        self, node: ResourceNode, description: str, viking_fs, has_ocr: bool, root_dir_uri: str
     ):
         """
-        Phase 2: Generate abstract and overview.
+        Phase 2: Generate abstract and overview and write to .abstract.md and .overview.md.
 
         Args:
             node: ResourceNode to update
             description: Image description
             viking_fs: VikingFS instance
             has_ocr: Whether OCR file exists
+            root_dir_uri: Root directory URI to write semantic files
         """
         # Generate abstract (short summary, < 100 tokens)
         abstract = description[:200] if len(description) > 200 else description
@@ -265,6 +293,10 @@ async def _generate_semantic_info(
         node.meta["abstract"] = abstract
         node.meta["overview"] = overview
 
+        # Write to files in temp directory
+        await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract)
+        await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview)
+
     async def parse_content(
         self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
     ) -> ParseResult:
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
index 53cccf67..84ee468f 100644
--- a/openviking/parse/parsers/media/video.py
+++ b/openviking/parse/parsers/media/video.py
@@ -29,6 +29,7 @@
 
 from openviking.parse.base import NodeType, ParseResult, ResourceNode
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.media.constants import VIDEO_EXTENSIONS
 from openviking_cli.utils.config.parser_config import VideoConfig
 
 
@@ -50,7 +51,7 @@ def __init__(self, config: Optional[VideoConfig] = None, **kwargs):
     @property
     def supported_extensions(self) -> List[str]:
         """Return supported video file extensions."""
-        return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"]
+        return VIDEO_EXTENSIONS
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index 378bcd7d..ac51fec9 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -8,16 +8,19 @@
 
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 from openviking.parse.base import ParseResult
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.code import CodeRepositoryParser
+from openviking.parse.parsers.directory import DirectoryParser
 from openviking.parse.parsers.epub import EPubParser
 from openviking.parse.parsers.excel import ExcelParser
 
 # Import will be handled dynamically to avoid dependency issues
 from openviking.parse.parsers.html import HTMLParser
 from openviking.parse.parsers.markdown import MarkdownParser
+from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser
 from openviking.parse.parsers.pdf import PDFParser
 from openviking.parse.parsers.powerpoint import PowerPointParser
 from openviking.parse.parsers.text import TextParser
@@ -25,6 +28,8 @@
 # Import markitdown-inspired parsers
 from openviking.parse.parsers.word import WordParser
 from openviking.parse.parsers.zip_parser import ZipParser
+from openviking_cli.utils.config import get_openviking_config
+from openviking_cli.utils.config.parser_config import load_parser_configs_from_dict
 
 if TYPE_CHECKING:
     from openviking.parse.custom import CustomParserProtocol
@@ -34,27 +39,35 @@
 
 class ParserRegistry:
     """
-    Registry for document parsers.
+    Registry for document parsers, which is a singleton.
 
     Automatically selects appropriate parser based on file extension.
     """
 
-    def __init__(self, register_optional: bool = True):
+    def __init__(
+        self, register_optional: bool = True, parser_configs: Optional[Dict[str, Any]] = None
+    ):
         """
         Initialize registry with default parsers.
 
         Args:
             register_optional: Whether to register optional parsers
                               that require extra dependencies
+            parser_configs: Dictionary of parser configurations (from load_parser_configs_from_dict)
         """
         self._parsers: Dict[str, BaseParser] = {}
         self._extension_map: Dict[str, str] = {}
 
+        # Get parser configs
+        self._parser_configs = parser_configs or {}
+        config = get_openviking_config()
+        self._parser_configs = load_parser_configs_from_dict(config.parsers)
+
         # Register core parsers
-        self.register("text", TextParser())
-        self.register("markdown", MarkdownParser())
-        self.register("pdf", PDFParser())
-        self.register("html", HTMLParser())
+        self.register("text", TextParser(config=self._parser_configs.get("text")))
+        self.register("markdown", MarkdownParser(config=self._parser_configs.get("markdown")))
+        self.register("pdf", PDFParser(config=self._parser_configs.get("pdf")))
+        self.register("html", HTMLParser())  # HTMLParser doesn't accept config yet
 
         # Register markitdown-inspired parsers (built-in)
         self.register("word", WordParser())
@@ -62,38 +75,12 @@ def __init__(self, register_optional: bool = True):
         self.register("excel", ExcelParser())
         self.register("epub", EPubParser())
         self.register("zip", ZipParser())
+        self.register("code", CodeRepositoryParser())
+        self.register("directory", DirectoryParser())
 
-        # Register code parser dynamically
-        try:
-            from openviking.parse.parsers.code import CodeRepositoryParser
-
-            self.register("code", CodeRepositoryParser())
-        except ImportError as e:
-            logger.warning(f"CodeRepositoryParser not available: {e}")
-
-        # Register directory parser
-        try:
-            from openviking.parse.parsers.directory import DirectoryParser
-
-            self.register("directory", DirectoryParser())
-        except ImportError as e:
-            logger.warning(f"DirectoryParser not available: {e}")
-
-        # Register optional media parsers
-        if register_optional:
-            try:
-                from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser
-
-                self.register("image", ImageParser())
-                logger.info("Registered ImageParser for image formats")
-
-                self.register("audio", AudioParser())
-                logger.info("Registered AudioParser for audio formats")
-
-                self.register("video", VideoParser())
-                logger.info("Registered VideoParser for video formats")
-            except ImportError as e:
-                logger.debug(f"Media parsers not registered: {e}")
+        self.register("image", ImageParser(config=self._parser_configs.get("image")))
+        self.register("audio", AudioParser(config=self._parser_configs.get("audio")))
+        self.register("video", VideoParser(config=self._parser_configs.get("video")))
 
     def register(self, name: str, parser: BaseParser) -> None:
         """
@@ -290,11 +277,11 @@ def list_supported_extensions(self) -> List[str]:
 _default_registry: Optional[ParserRegistry] = None
 
 
-def get_registry() -> ParserRegistry:
+def get_registry(parser_configs: Optional[Dict[str, Any]] = None) -> ParserRegistry:
     """Get the default parser registry."""
     global _default_registry
     if _default_registry is None:
-        _default_registry = ParserRegistry()
+        _default_registry = ParserRegistry(parser_configs=parser_configs)
     return _default_registry
 
 
diff --git a/openviking/prompts/templates/parsing/image_summary.yaml b/openviking/prompts/templates/parsing/image_summary.yaml
new file mode 100644
index 00000000..26bf3f0f
--- /dev/null
+++ b/openviking/prompts/templates/parsing/image_summary.yaml
@@ -0,0 +1,31 @@
+metadata:
+  id: "parsing.image_summary"
+  name: "Image Summary"
+  description: "Generate concise image summary for semantic parsing"
+  version: "1.0.0"
+  language: "en"
+  category: "parsing"
+
+variables:
+  - name: "context"
+    type: "string"
+    description: "Additional context for image understanding"
+    default: "No additional context"
+    required: false
+
+template: |
+  Please analyze this image and generate a concise summary for semantic indexing.
+
+  Context: {{ context }}
+
+  Generate a comprehensive description that includes:
+  1. What is in the image (main subjects, objects)
+  2. What is happening or what the image depicts
+  3. Any text visible in the image
+  4. Key visual elements and their relationships
+
+  Keep the description clear and detailed, suitable for semantic search and understanding.
+
+llm_config:
+  temperature: 0.0
+  supports_vision: true
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index 38da777a..5904d826 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -12,7 +12,9 @@
     FILE_TYPE_CODE,
     FILE_TYPE_DOCUMENTATION,
     FILE_TYPE_OTHER,
+    IGNORE_EXTENSIONS,
 )
+from openviking.parse.parsers.upload_utils import is_text_file
 from openviking.prompts import render_prompt
 from openviking.storage.queuefs.named_queue import DequeueHandlerBase
 from openviking.storage.queuefs.semantic_dag import DagStats, SemanticDagExecutor
@@ -289,10 +291,26 @@ async def _generate_single_file_summary(
         file_name = file_path.split("/")[-1]
 
         try:
+            # Check if this is a binary file that should be skipped
+            from pathlib import Path
+
+            p = Path(file_name)
+            extension = p.suffix.lower()
+
+            # Skip binary files (using IGNORE_EXTENSIONS as reference)
+            if extension in IGNORE_EXTENSIONS or not is_text_file(file_name):
+                logger.debug(f"Skipping binary file for summary generation: {file_path}")
+                return {"name": file_name, "summary": ""}
+
             # Read file content (limit length)
             content = await viking_fs.read_file(file_path)
             if isinstance(content, bytes):
-                content = content.decode("utf-8")
+                # Try to decode with error handling for text files
+                try:
+                    content = content.decode("utf-8")
+                except UnicodeDecodeError:
+                    logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
+                    return {"name": file_name, "summary": ""}
 
             # Limit content length (about 10000 tokens)
             max_chars = 30000
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py
index 14931b6e..079daf53 100644
--- a/openviking_cli/utils/config/parser_config.py
+++ b/openviking_cli/utils/config/parser_config.py
@@ -229,7 +229,7 @@ class ImageConfig(ParserConfig):
     Configuration for image parsing.
 
     Attributes:
-        enable_ocr: Whether to perform OCR text extraction
+        enable_ocr: Whether to perform OCR text extraction (not yet implemented)
         enable_vlm: Whether to use VLM for visual understanding
         ocr_lang: Language for OCR (e.g., "chi_sim", "eng")
         vlm_model: VLM model to use (e.g., "gpt-4-vision")
diff --git a/tests/parse/test_add_directory.py b/tests/parse/test_add_directory.py
index 0402adf3..81fd650b 100644
--- a/tests/parse/test_add_directory.py
+++ b/tests/parse/test_add_directory.py
@@ -180,6 +180,18 @@ def tmp_mixed(tmp_path: Path) -> Path:
     return tmp_path
 
 
+@pytest.fixture
+def tmp_media_files(tmp_path: Path) -> Path:
+    """Directory with various media files and regular files."""
+    (tmp_path / "docs.md").write_text("# Documentation", encoding="utf-8")
+    (tmp_path / "image.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+    (tmp_path / "photo.jpg").write_bytes(b"\xff\xd8\xff")
+    (tmp_path / "audio.mp3").write_bytes(b"ID3")
+    (tmp_path / "video.mp4").write_bytes(b"\x00\x00\x00\x18ftyp")
+    (tmp_path / "script.js").write_text("console.log('test')", encoding="utf-8")
+    return tmp_path
+
+
 # ---------------------------------------------------------------------------
 # Tests: basic properties
 # ---------------------------------------------------------------------------
@@ -461,3 +473,92 @@ async def test_result_fields(self, tmp_code: Path, parser, fake_fs) -> None:
         assert result.meta["dir_name"] == tmp_code.name
         assert result.meta["total_processable"] == 3
         assert result.meta["file_count"] == 3
+
+
+# ---------------------------------------------------------------------------
+# Tests: directly_upload_media parameter
+# ---------------------------------------------------------------------------
+
+
+class TestDirectlyUploadMedia:
+    """Test the directly_upload_media parameter behavior."""
+
+    @pytest.mark.asyncio
+    async def test_default_directly_upload_media_true(self, tmp_media_files: Path, fake_fs) -> None:
+        """Test that with directly_upload_media=True (default), media files are uploaded directly."""
+        with patch.object(BaseParser, "_get_viking_fs", return_value=fake_fs):
+            parser = DirectoryParser()
+            await parser.parse(str(tmp_media_files))
+
+        uploaded_names = {uri.split("/")[-1] for uri in fake_fs.files}
+
+        assert "docs.md" in uploaded_names
+        assert "image.png" in uploaded_names
+        assert "photo.jpg" in uploaded_names
+        assert "audio.mp3" in uploaded_names
+        assert "video.mp4" in uploaded_names
+        assert "script.js" in uploaded_names
+
+    @pytest.mark.asyncio
+    async def test_directly_upload_media_false(self, tmp_media_files: Path, fake_fs) -> None:
+        """Test that with directly_upload_media=False, media files go through their parsers."""
+        mock_image_result = create_parse_result(
+            root=ResourceNode(type=NodeType.ROOT),
+            source_path=str(tmp_media_files / "image.png"),
+            source_format="image",
+            parser_name="ImageParser",
+            parse_time=0.1,
+        )
+        mock_image_result.temp_dir_path = fake_fs.create_temp_uri()
+
+        mock_audio_result = create_parse_result(
+            root=ResourceNode(type=NodeType.ROOT),
+            source_path=str(tmp_media_files / "audio.mp3"),
+            source_format="audio",
+            parser_name="AudioParser",
+            parse_time=0.1,
+        )
+        mock_audio_result.temp_dir_path = fake_fs.create_temp_uri()
+
+        mock_video_result = create_parse_result(
+            root=ResourceNode(type=NodeType.ROOT),
+            source_path=str(tmp_media_files / "video.mp4"),
+            source_format="video",
+            parser_name="VideoParser",
+            parse_time=0.1,
+        )
+        mock_video_result.temp_dir_path = fake_fs.create_temp_uri()
+
+        with patch.object(BaseParser, "_get_viking_fs", return_value=fake_fs):
+            parser = DirectoryParser()
+
+            with patch.object(parser, "_assign_parser") as mock_assign:
+                from openviking.parse.parsers.media.audio import AudioParser
+                from openviking.parse.parsers.media.image import ImageParser
+                from openviking.parse.parsers.media.video import VideoParser
+
+                mock_image = AsyncMock(spec=ImageParser)
+                mock_image.parse = AsyncMock(return_value=mock_image_result)
+
+                mock_audio = AsyncMock(spec=AudioParser)
+                mock_audio.parse = AsyncMock(return_value=mock_audio_result)
+
+                mock_video = AsyncMock(spec=VideoParser)
+                mock_video.parse = AsyncMock(return_value=mock_video_result)
+
+                def assign_side_effect(cf, registry):
+                    if cf.path.suffix in {".png", ".jpg"}:
+                        return mock_image
+                    elif cf.path.suffix in {".mp3"}:
+                        return mock_audio
+                    elif cf.path.suffix in {".mp4"}:
+                        return mock_video
+                    return registry.get_parser_for_file(cf.path)
+
+                mock_assign.side_effect = assign_side_effect
+
+                await parser.parse(str(tmp_media_files), directly_upload_media=False)
+
+        assert mock_image.parse.call_count == 2
+        mock_audio.parse.assert_called_once()
+        mock_video.parse.assert_called_once()

From 0974f3b780341662699707a8e65a7c19195833ca Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Thu, 19 Feb 2026 17:16:16 +0800
Subject: [PATCH 12/18] feat: vlm optimization for image

---
 openviking/parse/registry.py | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index ac51fec9..5777edec 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -8,7 +8,7 @@
 
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
 
 from openviking.parse.base import ParseResult
 from openviking.parse.parsers.base_parser import BaseParser
@@ -28,8 +28,6 @@
 # Import markitdown-inspired parsers
 from openviking.parse.parsers.word import WordParser
 from openviking.parse.parsers.zip_parser import ZipParser
-from openviking_cli.utils.config import get_openviking_config
-from openviking_cli.utils.config.parser_config import load_parser_configs_from_dict
 
 if TYPE_CHECKING:
     from openviking.parse.custom import CustomParserProtocol
@@ -44,9 +42,7 @@ class ParserRegistry:
     Automatically selects appropriate parser based on file extension.
     """
 
-    def __init__(
-        self, register_optional: bool = True, parser_configs: Optional[Dict[str, Any]] = None
-    ):
+    def __init__(self, register_optional: bool = True):
         """
         Initialize registry with default parsers.
 
@@ -58,15 +54,10 @@ def __init__(
         self._parsers: Dict[str, BaseParser] = {}
         self._extension_map: Dict[str, str] = {}
 
-        # Get parser configs
-        self._parser_configs = parser_configs or {}
-        config = get_openviking_config()
-        self._parser_configs = load_parser_configs_from_dict(config.parsers)
-
         # Register core parsers
-        self.register("text", TextParser(config=self._parser_configs.get("text")))
-        self.register("markdown", MarkdownParser(config=self._parser_configs.get("markdown")))
-        self.register("pdf", PDFParser(config=self._parser_configs.get("pdf")))
+        self.register("text", TextParser())
+        self.register("markdown", MarkdownParser())
+        self.register("pdf", PDFParser())
         self.register("html", HTMLParser())  # HTMLParser doesn't accept config yet
 
         # Register markitdown-inspired parsers (built-in)
@@ -78,9 +69,9 @@ def __init__(
         self.register("code", CodeRepositoryParser())
         self.register("directory", DirectoryParser())
 
-        self.register("image", ImageParser(config=self._parser_configs.get("image")))
-        self.register("audio", AudioParser(config=self._parser_configs.get("audio")))
-        self.register("video", VideoParser(config=self._parser_configs.get("video")))
+        self.register("image", ImageParser())
+        self.register("audio", AudioParser())
+        self.register("video", VideoParser())
 
     def register(self, name: str, parser: BaseParser) -> None:
         """
@@ -277,11 +268,11 @@ def list_supported_extensions(self) -> List[str]:
 _default_registry: Optional[ParserRegistry] = None
 
 
-def get_registry(parser_configs: Optional[Dict[str, Any]] = None) -> ParserRegistry:
+def get_registry() -> ParserRegistry:
     """Get the default parser registry."""
     global _default_registry
     if _default_registry is None:
-        _default_registry = ParserRegistry(parser_configs=parser_configs)
+        _default_registry = ParserRegistry()
     return _default_registry
 
 

From 25d52b98fe02932059d021614c9751b49d82d276 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Fri, 20 Feb 2026 12:14:45 +0800
Subject: [PATCH 13/18] refactor: move media content understanding to
 SemanticProcessor

- Add parse/parsers/media/utils.py with media helpers
- Refactor ImageParser.parse(), AudioParser.parse(), VideoParser.parse() to remove content understanding, keep only metadata extraction
- Update SemanticProcessor._generate_single_file_summary() to handle media types and call media utils for summary generation
- Update TreeBuilder._get_base_uri() to use media utils
- Update ResourceNode.get_abstract() and get_overview() to check meta for abstract/overview
- Add debug logs and error handling
---
 openviking/parse/base.py                      |   8 +-
 openviking/parse/parsers/media/__init__.py    |   3 +-
 openviking/parse/parsers/media/audio.py       |  38 +----
 openviking/parse/parsers/media/image.py       |  59 ++-----
 openviking/parse/parsers/media/utils.py       | 145 ++++++++++++++++++
 openviking/parse/parsers/media/video.py       |  35 +----
 openviking/parse/tree_builder.py              |  77 +++-------
 .../storage/queuefs/semantic_processor.py     |  38 ++++-
 openviking/utils/resource_processor.py        |   1 +
 9 files changed, 232 insertions(+), 172 deletions(-)
 create mode 100644 openviking/parse/parsers/media/utils.py

diff --git a/openviking/parse/base.py b/openviking/parse/base.py
index 9548f4b4..91eec900 100644
--- a/openviking/parse/base.py
+++ b/openviking/parse/base.py
@@ -254,7 +254,7 @@ def get_text(self, include_children: bool = True) -> str:
                 texts.append(child.get_text(include_children=True))
         return "\n".join(texts)
 
-    def get_abstract(self, max_length: int = 200) -> str:
+    def get_abstract(self, max_length: int = 256) -> str:
         """
         Generate L0 abstract for this node.
 
@@ -264,6 +264,8 @@ def get_abstract(self, max_length: int = 200) -> str:
         Returns:
             Abstract text
         """
+        if "abstract" in self.meta:
+            return self.meta["abstract"]
         if self.title:
             abstract = self.title
         else:
@@ -285,8 +287,10 @@ def get_overview(self, max_length: int = 4000) -> str:
         Returns:
             Overview text including structure summary
         """
+        if "overview" in self.meta:
+            return self.meta["overview"]
+        # Default overview generation
         parts = []
-
         if self.title:
             parts.append(f"**{self.title}**")
 
diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py
index 7fed46b5..9f58977f 100644
--- a/openviking/parse/parsers/media/__init__.py
+++ b/openviking/parse/parsers/media/__init__.py
@@ -3,6 +3,7 @@
 
 from .audio import AudioParser
 from .image import ImageParser
+from .utils import get_media_base_uri, get_media_type
 from .video import VideoParser
 
-__all__ = ["ImageParser", "AudioParser", "VideoParser"]
+__all__ = ["ImageParser", "AudioParser", "VideoParser", "get_media_type", "get_media_base_uri"]
diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py
index e9473658..3caeb44a 100644
--- a/openviking/parse/parsers/media/audio.py
+++ b/openviking/parse/parsers/media/audio.py
@@ -55,19 +55,7 @@ def supported_extensions(self) -> List[str]:
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
-        Parse audio file using three-phase architecture.
-
-        Phase 1: Generate temporary files
-        - Copy original audio to temp_uri/content.{ext}
-        - (Optional) Generate transcript with timestamps
-
-        Phase 2: Generate semantic info
-        - Generate abstract and overview based on description
-        - Overview includes file list and usage instructions
-
-        Phase 3: Build directory structure
-        - Move all files to final URI
-        - Generate .abstract.md, .overview.md
+        Parse audio file - only copy original file and extract basic metadata, no content understanding.
 
         Args:
             source: Audio file path
@@ -140,24 +128,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         channels = 0
         format_str = ext[1:].upper()
 
-        # 1.3 Generate ASR description
-        description = ""
-        if self.config.enable_transcription:
-            description = await self._asr_transcribe(audio_bytes, self.config.asr_model)
-        else:
-            # Fallback: basic description
-            description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)"
-
-        # 1.4 Transcript with timestamps (optional)
-        transcript_text = None
-        if self.config.enable_transcription and self.config.enable_timestamps:
-            transcript_text = await self._asr_transcribe_with_timestamps(
-                audio_bytes, self.config.asr_model
-            )
-            if transcript_text:
-                await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text)
-
-        # Create ResourceNode
+        # Create ResourceNode - metadata only, no content understanding yet
         root_node = ResourceNode(
             type=NodeType.ROOT,
             title=file_path.stem,
@@ -177,11 +148,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             },
         )
 
-        # Phase 2: Generate semantic info
-        await self._generate_semantic_info(
-            root_node, description, viking_fs, transcript_text is not None
-        )
-
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
             root=root_node,
diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py
index 059735d1..3442f26e 100644
--- a/openviking/parse/parsers/media/image.py
+++ b/openviking/parse/parsers/media/image.py
@@ -18,12 +18,15 @@
 from openviking.parse.base import NodeType, ParseResult, ResourceNode
 from openviking.parse.parsers.base_parser import BaseParser
 from openviking.parse.parsers.media.constants import IMAGE_EXTENSIONS
+from openviking.prompts import render_prompt
 from openviking.storage.viking_fs import get_viking_fs
 from openviking_cli.utils.config import get_openviking_config
 from openviking_cli.utils.config.parser_config import ImageConfig
 from openviking_cli.utils.logger import get_logger
 from openviking_cli.utils.uri import VikingURI
 
+logger = get_logger(__name__)
+
 # =============================================================================
 # Configuration Classes
 # =============================================================================
@@ -71,19 +74,7 @@ def supported_extensions(self) -> List[str]:
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
-        Parse image file using three-phase architecture.
-
-        Phase 1: Generate temporary files
-        - Copy original image to temp_uri/content.{ext}
-        - (Optional) Generate ocr.md using OCR
-
-        Phase 2: Generate semantic info
-        - Generate abstract and overview based on description
-        - Overview includes file list and usage instructions
-
-        Phase 3: Build directory structure
-        - Move all files to final URI
-        - Generate .abstract.md, .overview.md
+        Parse image file - only copy original file and extract basic metadata, no content understanding.
 
         Args:
             source: Image file path
@@ -96,7 +87,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             FileNotFoundError: If source file does not exist
             IOError: If image processing fails
         """
-
         # Convert to Path object
         file_path = Path(source) if isinstance(source, str) else source
         if not file_path.exists():
@@ -132,22 +122,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         except Exception as e:
             raise ValueError(f"Invalid image file: {file_path}. Error: {e}") from e
 
-        # 1.3 Generate VLM description
-        description = ""
-        if self.config.enable_vlm:
-            description = await self._vlm_describe(image_bytes, self.config.vlm_model)
-        else:
-            # Fallback: basic description
-            description = f"Image file: {file_path.name} ({format_str}, {width}x{height})"
-
-        # 1.4 OCR (optional)
-        ocr_text = None
-        if self.config.enable_ocr:
-            ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang)
-            if ocr_text:
-                await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text)
-
-        # Create ResourceNode
+        # Create ResourceNode - metadata only, no content understanding yet
         root_node = ResourceNode(
             type=NodeType.ROOT,
             title=file_path.stem,
@@ -166,11 +141,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             },
         )
 
-        # Phase 2: Generate semantic info
-        await self._generate_semantic_info(
-            root_node, description, viking_fs, ocr_text is not None, root_dir_uri
-        )
-
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
             root=root_node,
@@ -192,10 +162,6 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str:
         Returns:
             Image description in markdown format
         """
-        from openviking.prompts import render_prompt
-
-        logger = get_logger(__name__)
-
         try:
             vlm = get_openviking_config().vlm
 
@@ -206,17 +172,20 @@ async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str:
                     "context": "No additional context",
                 },
             )
-
-            # Call VLM
             response = await vlm.get_vision_completion_async(
                 prompt=prompt,
                 images=[image_bytes],
             )
+            logger.info(
+                f"[ImageParser._vlm_describe] VLM response received, length: {len(response)}, content: {response[:256]}"
+            )
 
             return response.strip()
 
         except Exception as e:
-            logger.error(f"Error in VLM image description: {e}")
+            logger.error(
+                f"[ImageParser._vlm_describe] Error in VLM image description: {e}", exc_info=True
+            )
             # Fallback to basic description
             return "Image description (VLM integration failed)\n\nThis is an image file."
 
@@ -250,7 +219,7 @@ async def _generate_semantic_info(
             root_dir_uri: Root directory URI to write semantic files
         """
         # Generate abstract (short summary, < 100 tokens)
-        abstract = description[:200] if len(description) > 200 else description
+        abstract = description[:253] + "..." if len(description) > 256 else description
 
         # Generate overview (content summary + file list + usage instructions)
         overview_parts = [
@@ -294,8 +263,8 @@ async def _generate_semantic_info(
         node.meta["overview"] = overview
 
         # Write to files in temp directory
-        await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract)
-        await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview)
+        # await viking_fs.write_file(f"{root_dir_uri}/.abstract.md", abstract)
+        # await viking_fs.write_file(f"{root_dir_uri}/.overview.md", overview)
 
     async def parse_content(
         self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py
new file mode 100644
index 00000000..f4acbdf4
--- /dev/null
+++ b/openviking/parse/parsers/media/utils.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+"""Media-related utilities for OpenViking."""
+
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from openviking.prompts import render_prompt
+from openviking.storage.viking_fs import get_viking_fs
+from openviking_cli.utils.config import get_openviking_config
+from openviking_cli.utils.logger import get_logger
+
+from .constants import AUDIO_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
+
+logger = get_logger(__name__)
+
+
+def get_media_type(source_path: Optional[str], source_format: Optional[str]) -> Optional[str]:
+    """
+    Determine media type from source path or format.
+
+    Args:
+        source_path: Source file path
+        source_format: Source format string (e.g., "image", "audio", "video")
+
+    Returns:
+        Media type ("image", "audio", "video") or None if not a media file
+    """
+    if source_format:
+        if source_format in ["image", "audio", "video"]:
+            return source_format
+
+    if source_path:
+        ext = Path(source_path).suffix.lower()
+        if ext in IMAGE_EXTENSIONS:
+            return "image"
+        elif ext in AUDIO_EXTENSIONS:
+            return "audio"
+        elif ext in VIDEO_EXTENSIONS:
+            return "video"
+
+    return None
+
+
+def get_media_base_uri(media_type: str) -> str:
+    """
+    Get base URI for media files.
+
+    Args:
+        media_type: Media type ("image", "audio", "video")
+
+    Returns:
+        Base URI like "viking://resources/images/20250219"
+    """
+    # Map singular media types to plural directory names
+    media_dir_map = {"image": "images", "audio": "audio", "video": "video"}
+    media_dir = media_dir_map.get(media_type, media_type)
+    # Get current date in YYYYMMDD format
+    date_str = datetime.now().strftime("%Y%m%d")
+    return f"viking://resources/{media_dir}/{date_str}"
+
+
+async def generate_image_summary(image_uri: str, original_filename: str) -> Dict[str, Any]:
+    """
+    Generate summary for an image file using VLM.
+
+    Args:
+        image_uri: URI to the image file in VikingFS
+        original_filename: Original filename of the image
+
+    Returns:
+        Dictionary with "name" and "summary" keys
+    """
+    viking_fs = get_viking_fs()
+    vlm = get_openviking_config().vlm
+    file_name = original_filename
+
+    try:
+        # Read image bytes
+        image_bytes = await viking_fs.read_file(image_uri)
+        if not isinstance(image_bytes, bytes):
+            raise ValueError(f"Expected bytes for image file, got {type(image_bytes)}")
+
+        logger.info(
+            f"[MediaUtils.generate_image_summary] Generating summary for image: {image_uri}"
+        )
+
+        # Render prompt
+        prompt = render_prompt(
+            "parsing.image_summary",
+            {"context": "No additional context"},
+        )
+
+        # Call VLM
+        response = await vlm.get_vision_completion_async(
+            prompt=prompt,
+            images=[image_bytes],
+        )
+
+        logger.info(
+            f"[MediaUtils.generate_image_summary] VLM response received, length: {len(response)}"
+        )
+        return {"name": file_name, "summary": response.strip()}
+
+    except Exception as e:
+        logger.error(
+            f"[MediaUtils.generate_image_summary] Failed to generate image summary: {e}",
+            exc_info=True,
+        )
+        return {"name": file_name, "summary": "Image summary generation failed"}
+
+
+async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict[str, Any]:
+    """
+    Generate summary for an audio file (placeholder).
+
+    Args:
+        audio_uri: URI to the audio file in VikingFS
+        original_filename: Original filename of the audio
+
+    Returns:
+        Dictionary with "name" and "summary" keys
+    """
+    logger.info(
+        f"[MediaUtils.generate_audio_summary] Audio summary generation not yet implemented for: {audio_uri}"
+    )
+    return {"name": original_filename, "summary": "Audio summary generation not yet implemented"}
+
+
+async def generate_video_summary(video_uri: str, original_filename: str) -> Dict[str, Any]:
+    """
+    Generate summary for a video file (placeholder).
+
+    Args:
+        video_uri: URI to the video file in VikingFS
+        original_filename: Original filename of the video
+
+    Returns:
+        Dictionary with "name" and "summary" keys
+    """
+    logger.info(
+        f"[MediaUtils.generate_video_summary] Video summary generation not yet implemented for: {video_uri}"
+    )
+    return {"name": original_filename, "summary": "Video summary generation not yet implemented"}
diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py
index 84ee468f..a0a07602 100644
--- a/openviking/parse/parsers/media/video.py
+++ b/openviking/parse/parsers/media/video.py
@@ -55,20 +55,7 @@ def supported_extensions(self) -> List[str]:
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
-        Parse video file using three-phase architecture.
-
-        Phase 1: Generate temporary files
-        - Copy original video to temp_uri/content.{ext}
-        - Extract key frames
-        - Extract audio track and transcribe using ASR
-
-        Phase 2: Generate semantic info
-        - Generate abstract and overview based on descriptions
-        - Overview includes file list and usage instructions
-
-        Phase 3: Build directory structure
-        - Move all files to final URI
-        - Generate .abstract.md, .overview.md
+        Parse video file - only copy original file and extract basic metadata, no content understanding.
 
         Args:
             source: Video file path
@@ -142,22 +129,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         fps = 0
         format_str = ext[1:].upper()
 
-        # 1.3 Generate combined description
-        description = ""
-        if self.config.enable_key_frames or self.config.enable_audio_transcription:
-            description = await self._generate_video_description(file_path, self.config)
-        else:
-            # Fallback: basic description
-            description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)"
-
-        # 1.4 Key frames (optional)
-        key_frames_dir = f"{root_dir_uri}/keyframes"
-        has_key_frames = False
-        if self.config.enable_key_frames:
-            await viking_fs.mkdir(key_frames_dir)
-            has_key_frames = True
-
-        # Create ResourceNode
+        # Create ResourceNode - metadata only, no content understanding yet
         root_node = ResourceNode(
             type=NodeType.ROOT,
             title=file_path.stem,
@@ -178,9 +150,6 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             },
         )
 
-        # Phase 2: Generate semantic info
-        await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames)
-
         # Phase 3: Build directory structure (handled by TreeBuilder)
         return ParseResult(
             root=root_node,
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index 0e582dab..3310794b 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -24,6 +24,8 @@
 from typing import TYPE_CHECKING, Optional
 
 from openviking.core.building_tree import BuildingTree
+from openviking.parse.parsers.media.utils import get_media_base_uri, get_media_type
+from openviking.storage.queuefs import SemanticMsg, get_queue_manager
 from openviking.storage.viking_fs import get_viking_fs
 from openviking_cli.utils.uri import VikingURI
 
@@ -58,10 +60,15 @@ def __init__(self):
         """Initialize TreeBuilder."""
         pass
 
-    def _get_base_uri(self, scope: str) -> str:
-        """Get base URI for scope."""
-        # Resources are now in independent resources scope
+    def _get_base_uri(
+        self, scope: str, source_path: Optional[str] = None, source_format: Optional[str] = None
+    ) -> str:
+        """Get base URI for scope, with special handling for media files."""
+        # Check if it's a media file first
         if scope == "resources":
+            media_type = get_media_type(source_path, source_format)
+            if media_type:
+                return get_media_base_uri(media_type)
             return "viking://resources"
         if scope == "user":
             # user resources go to memories (no separate resources dir)
@@ -93,6 +100,7 @@ async def finalize_from_temp(
             temp_dir_path: Temporary directory Viking URI (e.g., viking://temp/xxx)
             scope: Scope ("resources", "user", or "agent")
             base_uri: Base URI (None = use scope default)
+            source_node: Source ResourceNode
             source_path: Source file path
             source_format: Source file format
 
@@ -115,70 +123,38 @@ async def finalize_from_temp(
                 f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
             )
 
-        from openviking_cli.utils.uri import VikingURI
-
         doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"])
-        doc_uri = f"{temp_uri}/{doc_name}"
+        temp_doc_uri = f"{temp_uri}/{doc_name}"
 
         # 2. Determine base_uri
         if base_uri is None:
-            # Check if it's a media file (image/audio/video)
-            media_type = None
-            if source_format:
-                if source_format in ["image", "audio", "video"]:
-                    media_type = source_format
-            elif source_path:
-                from pathlib import Path
-
-                ext = Path(source_path).suffix.lower()
-                image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
-                audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
-                video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv"]
-                if ext in image_exts:
-                    media_type = "image"
-                elif ext in audio_exts:
-                    media_type = "audio"
-                elif ext in video_exts:
-                    media_type = "video"
-
-            if media_type:
-                # Map singular media types to plural directory names
-                media_dir_map = {"image": "images", "audio": "audio", "video": "video"}
-                media_dir = media_dir_map.get(media_type, media_type)
-                # Get current date in YYYYMMDD format
-                from datetime import datetime
-
-                date_str = datetime.now().strftime("%Y%m%d")
-                base_uri = f"viking://resources/{media_dir}/{date_str}"
-            else:
-                base_uri = self._get_base_uri(scope)
-
-        logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}")
+            base_uri = self._get_base_uri(scope, source_path, source_format)
 
         # 3. Build final URI, auto-renaming on conflict (e.g. doc_1, doc_2, ...)
         candidate_uri = VikingURI(base_uri).join(doc_name).uri
         final_uri = await self._resolve_unique_uri(candidate_uri)
         if final_uri != candidate_uri:
-            logger.info(f"Resolved name conflict: {candidate_uri} -> {final_uri}")
+            logger.info(f"[TreeBuilder] Resolved name conflict: {candidate_uri} -> {final_uri}")
+        else:
+            logger.info(f"[TreeBuilder] Finalizing from temp: {final_uri}")
 
         # 4. Move directory tree from temp to final location in AGFS
-        await self._move_directory_in_agfs(doc_uri, final_uri)
-        logger.info(f"Moved temp tree: {doc_uri} -> {final_uri}")
+        await self._move_directory_in_agfs(temp_doc_uri, final_uri)
+        logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}")
 
         # 5. Cleanup temporary root directory
         try:
             await viking_fs.delete_temp(temp_uri)
-            logger.info(f"Cleaned up temp root: {temp_uri}")
+            logger.info(f"[TreeBuilder] Cleaned up temp root: {temp_uri}")
         except Exception as e:
-            logger.warning(f"Failed to cleanup temp root: {e}")
+            logger.warning(f"[TreeBuilder] Failed to cleanup temp root: {e}")
 
         # 6. Enqueue to SemanticQueue for async semantic generation
         try:
-            context_type = "resource"  # Default to resource
-            await self._enqueue_semantic_generation(final_uri, context_type)
-            logger.info(f"Enqueued semantic generation for: {final_uri}")
+            await self._enqueue_semantic_generation(final_uri, scope)
+            logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}")
         except Exception as e:
-            logger.error(f"Failed to enqueue semantic generation: {e}", exc_info=True)
+            logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True)
 
         # 7. Return simple BuildingTree (no scanning needed)
         tree = BuildingTree(
@@ -187,8 +163,6 @@ async def finalize_from_temp(
         )
         tree._root_uri = final_uri
 
-        logger.info(f"Finalized tree: root_uri={final_uri}")
-
         return tree
 
     async def _resolve_unique_uri(self, uri: str, max_attempts: int = 100) -> str:
@@ -215,9 +189,7 @@ async def _exists(u: str) -> bool:
             if not await _exists(candidate):
                 return candidate
 
-        raise FileExistsError(
-            f"Cannot resolve unique name for {uri} after {max_attempts} attempts"
-        )
+        raise FileExistsError(f"Cannot resolve unique name for {uri} after {max_attempts} attempts")
 
     async def _move_directory_in_agfs(self, src_uri: str, dst_uri: str) -> None:
         """Recursively move AGFS directory tree (copy + delete)."""
@@ -280,7 +252,6 @@ async def _enqueue_semantic_generation(self, uri: str, context_type: str) -> Non
             uri: Directory URI to enqueue
             context_type: resource/memory/skill
         """
-        from openviking.storage.queuefs import SemanticMsg, get_queue_manager
 
         queue_manager = get_queue_manager()
 
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index 5904d826..09a3ec77 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -291,12 +291,46 @@ async def _generate_single_file_summary(
         file_name = file_path.split("/")[-1]
 
         try:
-            # Check if this is a binary file that should be skipped
+            # Check if this is a media file first
             from pathlib import Path
 
+            from openviking.parse.parsers.media.utils import (
+                generate_audio_summary,
+                generate_image_summary,
+                generate_video_summary,
+                get_media_type,
+            )
+
             p = Path(file_name)
             extension = p.suffix.lower()
 
+            # Check media type
+            media_type = get_media_type(file_name, None)
+            if media_type:
+                logger.info(
+                    f"[SemanticProcessor] Generating media summary for: {file_path}, type: {media_type}"
+                )
+                # Find the original filename by listing the directory (since file_path is like viking://resources/images/xxx/xxx.png)
+                parent_uri = "/".join(file_path.split("/")[:-1])
+                try:
+                    entries = await viking_fs.ls(parent_uri)
+                    original_filename = file_name  # default to file_name
+                    for entry in entries:
+                        name = entry.get("name", "")
+                        if name and not name.startswith(".") and not entry.get("isDir"):
+                            original_filename = name
+                            break
+                except Exception:
+                    original_filename = file_name
+
+                if media_type == "image":
+                    return await generate_image_summary(file_path, original_filename)
+                elif media_type == "audio":
+                    return await generate_audio_summary(file_path, original_filename)
+                elif media_type == "video":
+                    return await generate_video_summary(file_path, original_filename)
+
+            # Check if this is a binary file that should be skipped
             # Skip binary files (using IGNORE_EXTENSIONS as reference)
             if extension in IGNORE_EXTENSIONS or not is_text_file(file_name):
                 logger.debug(f"Skipping binary file for summary generation: {file_path}")
@@ -345,7 +379,7 @@ async def _generate_single_file_summary(
             return {"name": file_name, "summary": summary.strip()}
 
         except Exception as e:
-            logger.warning(f"Failed to generate summary for {file_path}: {e}")
+            logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True)
             return {"name": file_name, "summary": ""}
 
     def _extract_abstract_from_overview(self, overview_content: str) -> str:
diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py
index 156c48dd..ff240d98 100644
--- a/openviking/utils/resource_processor.py
+++ b/openviking/utils/resource_processor.py
@@ -119,6 +119,7 @@ async def process_resource(
         except Exception as e:
             result["status"] = "error"
             result["errors"].append(f"Parse error: {e}")
+            logger.error(f"[ResourceProcessor] Parse error: {e}")
             return result
 
         # parse_result contains:

From 4590c0286a9dcd5a1a35cbdeecb002c42f99e791 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Fri, 20 Feb 2026 13:04:11 +0800
Subject: [PATCH 14/18] refactor: split _generate_single_file_summary to add
 _generate_text_summary

- Add _generate_text_summary function for text file processing
- Update media utils functions to accept llm_sem and use it to limit concurrent calls
- Update _generate_single_file_summary to call _generate_text_summary and media utils functions
- Fix import ordering
- Fix issue where _generate_file_summaries was creating a new semaphore; now each _generate_single_file_summary handles its own
---
 openviking/parse/parsers/media/utils.py       |  22 ++--
 .../storage/queuefs/semantic_processor.py     | 119 +++++++++---------
 2 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py
index f4acbdf4..d89522e4 100644
--- a/openviking/parse/parsers/media/utils.py
+++ b/openviking/parse/parsers/media/utils.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Media-related utilities for OpenViking."""
 
+import asyncio
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -61,7 +62,9 @@ def get_media_base_uri(media_type: str) -> str:
     return f"viking://resources/{media_dir}/{date_str}"
 
 
-async def generate_image_summary(image_uri: str, original_filename: str) -> Dict[str, Any]:
+async def generate_image_summary(
+    image_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None
+) -> Dict[str, Any]:
     """
     Generate summary for an image file using VLM.
 
@@ -93,10 +96,11 @@ async def generate_image_summary(image_uri: str, original_filename: str) -> Dict
         )
 
         # Call VLM
-        response = await vlm.get_vision_completion_async(
-            prompt=prompt,
-            images=[image_bytes],
-        )
+        async with llm_sem or asyncio.Semaphore(1):
+            response = await vlm.get_vision_completion_async(
+                prompt=prompt,
+                images=[image_bytes],
+            )
 
         logger.info(
             f"[MediaUtils.generate_image_summary] VLM response received, length: {len(response)}"
@@ -111,7 +115,9 @@ async def generate_image_summary(image_uri: str, original_filename: str) -> Dict
         return {"name": file_name, "summary": "Image summary generation failed"}
 
 
-async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict[str, Any]:
+async def generate_audio_summary(
+    audio_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None
+) -> Dict[str, Any]:
     """
     Generate summary for an audio file (placeholder).
 
@@ -128,7 +134,9 @@ async def generate_audio_summary(audio_uri: str, original_filename: str) -> Dict
     return {"name": original_filename, "summary": "Audio summary generation not yet implemented"}
 
 
-async def generate_video_summary(video_uri: str, original_filename: str) -> Dict[str, Any]:
+async def generate_video_summary(
+    video_uri: str, original_filename: str, llm_sem: Optional[asyncio.Semaphore] = None
+) -> Dict[str, Any]:
     """
     Generate summary for a video file (placeholder).
 
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index 09a3ec77..53e77738 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -3,6 +3,7 @@
 """SemanticProcessor: Processes messages from SemanticQueue, generates .abstract.md and .overview.md."""
 
 import asyncio
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 from openviking.core.context import Context, ResourceContentType, Vectorize
@@ -14,6 +15,12 @@
     FILE_TYPE_OTHER,
     IGNORE_EXTENSIONS,
 )
+from openviking.parse.parsers.media.utils import (
+    generate_audio_summary,
+    generate_image_summary,
+    generate_video_summary,
+    get_media_type,
+)
 from openviking.parse.parsers.upload_utils import is_text_file
 from openviking.prompts import render_prompt
 from openviking.storage.queuefs.named_queue import DequeueHandlerBase
@@ -252,11 +259,8 @@ async def _generate_file_summaries(
         if not file_paths:
             return []
 
-        sem = asyncio.Semaphore(self.max_concurrent_llm)
-
         async def generate_one_summary(file_path: str) -> Dict[str, str]:
-            async with sem:
-                summary = await self._generate_single_file_summary(file_path)
+            summary = await self._generate_single_file_summary(file_path)
             if enqueue_files and context_type and parent_uri:
                 try:
                     await self._vectorize_single_file(
@@ -275,6 +279,52 @@ async def generate_one_summary(file_path: str) -> Dict[str, str]:
         tasks = [generate_one_summary(fp) for fp in file_paths]
         return await asyncio.gather(*tasks)
 
+    async def _generate_text_summary(
+        self, file_path: str, file_name: str, llm_sem: asyncio.Semaphore
+    ) -> Dict[str, str]:
+        """Generate summary for a single text file (code, documentation, or other text)."""
+        viking_fs = get_viking_fs()
+        vlm = get_openviking_config().vlm
+
+        # Read file content (limit length)
+        content = await viking_fs.read_file(file_path)
+        if isinstance(content, bytes):
+            # Try to decode with error handling for text files
+            try:
+                content = content.decode("utf-8")
+            except UnicodeDecodeError:
+                logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
+                return {"name": file_name, "summary": ""}
+
+        # Limit content length (about 10000 tokens)
+        max_chars = 30000
+        if len(content) > max_chars:
+            content = content[:max_chars] + "\n...(truncated)"
+
+        # Generate summary
+        if not vlm.is_available():
+            logger.warning("VLM not available, using empty summary")
+            return {"name": file_name, "summary": ""}
+
+        # Detect file type and select appropriate prompt
+        file_type = self._detect_file_type(file_name)
+
+        if file_type == FILE_TYPE_CODE:
+            prompt_id = "semantic.code_summary"
+        elif file_type == FILE_TYPE_DOCUMENTATION:
+            prompt_id = "semantic.document_summary"
+        else:
+            prompt_id = "semantic.file_summary"
+
+        prompt = render_prompt(
+            prompt_id,
+            {"file_name": file_name, "content": content},
+        )
+
+        async with llm_sem:
+            summary = await vlm.get_completion_async(prompt)
+        return {"name": file_name, "summary": summary.strip()}
+
     async def _generate_single_file_summary(
         self, file_path: str, llm_sem: Optional[asyncio.Semaphore] = None
     ) -> Dict[str, str]:
@@ -287,20 +337,12 @@ async def _generate_single_file_summary(
             {"name": file_name, "summary": summary_content}
         """
         viking_fs = get_viking_fs()
-        vlm = get_openviking_config().vlm
         file_name = file_path.split("/")[-1]
 
+        llm_sem = llm_sem or asyncio.Semaphore(self.max_concurrent_llm)
+
         try:
             # Check if this is a media file first
-            from pathlib import Path
-
-            from openviking.parse.parsers.media.utils import (
-                generate_audio_summary,
-                generate_image_summary,
-                generate_video_summary,
-                get_media_type,
-            )
-
             p = Path(file_name)
             extension = p.suffix.lower()
 
@@ -324,11 +366,11 @@ async def _generate_single_file_summary(
                     original_filename = file_name
 
                 if media_type == "image":
-                    return await generate_image_summary(file_path, original_filename)
+                    return await generate_image_summary(file_path, original_filename, llm_sem)
                 elif media_type == "audio":
-                    return await generate_audio_summary(file_path, original_filename)
+                    return await generate_audio_summary(file_path, original_filename, llm_sem)
                 elif media_type == "video":
-                    return await generate_video_summary(file_path, original_filename)
+                    return await generate_video_summary(file_path, original_filename, llm_sem)
 
             # Check if this is a binary file that should be skipped
             # Skip binary files (using IGNORE_EXTENSIONS as reference)
@@ -336,47 +378,8 @@ async def _generate_single_file_summary(
                 logger.debug(f"Skipping binary file for summary generation: {file_path}")
                 return {"name": file_name, "summary": ""}
 
-            # Read file content (limit length)
-            content = await viking_fs.read_file(file_path)
-            if isinstance(content, bytes):
-                # Try to decode with error handling for text files
-                try:
-                    content = content.decode("utf-8")
-                except UnicodeDecodeError:
-                    logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
-                    return {"name": file_name, "summary": ""}
-
-            # Limit content length (about 10000 tokens)
-            max_chars = 30000
-            if len(content) > max_chars:
-                content = content[:max_chars] + "\n...(truncated)"
-
-            # Generate summary
-            if not vlm.is_available():
-                logger.warning("VLM not available, using empty summary")
-                return {"name": file_name, "summary": ""}
-
-            # Detect file type and select appropriate prompt
-            file_type = self._detect_file_type(file_name)
-
-            if file_type == FILE_TYPE_CODE:
-                prompt_id = "semantic.code_summary"
-            elif file_type == FILE_TYPE_DOCUMENTATION:
-                prompt_id = "semantic.document_summary"
-            else:
-                prompt_id = "semantic.file_summary"
-
-            prompt = render_prompt(
-                prompt_id,
-                {"file_name": file_name, "content": content},
-            )
-
-            if llm_sem:
-                async with llm_sem:
-                    summary = await vlm.get_completion_async(prompt)
-            else:
-                summary = await vlm.get_completion_async(prompt)
-            return {"name": file_name, "summary": summary.strip()}
+            # Process text file
+            return await self._generate_text_summary(file_path, file_name, llm_sem)
 
         except Exception as e:
             logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True)

From dd3e64e740f190638c47c28dee69adf57e288990 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Fri, 20 Feb 2026 13:36:57 +0800
Subject: [PATCH 15/18] feat: vlm optimization for image

---
 openviking/parse/parsers/media/utils.py       |  2 +-
 openviking/parse/tree_builder.py              |  2 +-
 .../storage/queuefs/semantic_processor.py     | 56 +++----------------
 3 files changed, 10 insertions(+), 50 deletions(-)

diff --git a/openviking/parse/parsers/media/utils.py b/openviking/parse/parsers/media/utils.py
index d89522e4..1e8ad30d 100644
--- a/openviking/parse/parsers/media/utils.py
+++ b/openviking/parse/parsers/media/utils.py
@@ -81,7 +81,7 @@ async def generate_image_summary(
 
     try:
         # Read image bytes
-        image_bytes = await viking_fs.read_file(image_uri)
+        image_bytes = await viking_fs.read_file_bytes(image_uri)
         if not isinstance(image_bytes, bytes):
             raise ValueError(f"Expected bytes for image file, got {type(image_bytes)}")
 
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index 3310794b..7d3f5a89 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -151,7 +151,7 @@ async def finalize_from_temp(
 
         # 6. Enqueue to SemanticQueue for async semantic generation
         try:
-            await self._enqueue_semantic_generation(final_uri, scope)
+            await self._enqueue_semantic_generation(final_uri, "resource")
             logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}")
         except Exception as e:
             logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True)
diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py
index 53e77738..cfad652d 100644
--- a/openviking/storage/queuefs/semantic_processor.py
+++ b/openviking/storage/queuefs/semantic_processor.py
@@ -3,7 +3,6 @@
 """SemanticProcessor: Processes messages from SemanticQueue, generates .abstract.md and .overview.md."""
 
 import asyncio
-from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 from openviking.core.context import Context, ResourceContentType, Vectorize
@@ -13,7 +12,6 @@
     FILE_TYPE_CODE,
     FILE_TYPE_DOCUMENTATION,
     FILE_TYPE_OTHER,
-    IGNORE_EXTENSIONS,
 )
 from openviking.parse.parsers.media.utils import (
     generate_audio_summary,
@@ -21,7 +19,6 @@
     generate_video_summary,
     get_media_type,
 )
-from openviking.parse.parsers.upload_utils import is_text_file
 from openviking.prompts import render_prompt
 from openviking.storage.queuefs.named_queue import DequeueHandlerBase
 from openviking.storage.queuefs.semantic_dag import DagStats, SemanticDagExecutor
@@ -336,55 +333,18 @@ async def _generate_single_file_summary(
         Returns:
             {"name": file_name, "summary": summary_content}
         """
-        viking_fs = get_viking_fs()
         file_name = file_path.split("/")[-1]
-
         llm_sem = llm_sem or asyncio.Semaphore(self.max_concurrent_llm)
-
-        try:
-            # Check if this is a media file first
-            p = Path(file_name)
-            extension = p.suffix.lower()
-
-            # Check media type
-            media_type = get_media_type(file_name, None)
-            if media_type:
-                logger.info(
-                    f"[SemanticProcessor] Generating media summary for: {file_path}, type: {media_type}"
-                )
-                # Find the original filename by listing the directory (since file_path is like viking://resources/images/xxx/xxx.png)
-                parent_uri = "/".join(file_path.split("/")[:-1])
-                try:
-                    entries = await viking_fs.ls(parent_uri)
-                    original_filename = file_name  # default to file_name
-                    for entry in entries:
-                        name = entry.get("name", "")
-                        if name and not name.startswith(".") and not entry.get("isDir"):
-                            original_filename = name
-                            break
-                except Exception:
-                    original_filename = file_name
-
-                if media_type == "image":
-                    return await generate_image_summary(file_path, original_filename, llm_sem)
-                elif media_type == "audio":
-                    return await generate_audio_summary(file_path, original_filename, llm_sem)
-                elif media_type == "video":
-                    return await generate_video_summary(file_path, original_filename, llm_sem)
-
-            # Check if this is a binary file that should be skipped
-            # Skip binary files (using IGNORE_EXTENSIONS as reference)
-            if extension in IGNORE_EXTENSIONS or not is_text_file(file_name):
-                logger.debug(f"Skipping binary file for summary generation: {file_path}")
-                return {"name": file_name, "summary": ""}
-
-            # Process text file
+        media_type = get_media_type(file_name, None)
+        if media_type == "image":
+            return await generate_image_summary(file_path, file_name, llm_sem)
+        elif media_type == "audio":
+            return await generate_audio_summary(file_path, file_name, llm_sem)
+        elif media_type == "video":
+            return await generate_video_summary(file_path, file_name, llm_sem)
+        else:
             return await self._generate_text_summary(file_path, file_name, llm_sem)
 
-        except Exception as e:
-            logger.warning(f"Failed to generate summary for {file_path}: {e}", exc_info=True)
-            return {"name": file_name, "summary": ""}
-
     def _extract_abstract_from_overview(self, overview_content: str) -> str:
         """Extract abstract from overview.md."""
         lines = overview_content.split("\n")

From d167ff14fe4e4d7f97baa959f151621b86639694 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Fri, 20 Feb 2026 13:39:40 +0800
Subject: [PATCH 16/18] feat: vlm optimization for image

---
 tests/parse/test_directory_parser_routing.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/parse/test_directory_parser_routing.py b/tests/parse/test_directory_parser_routing.py
index 88ae61bb..349fd2f9 100644
--- a/tests/parse/test_directory_parser_routing.py
+++ b/tests/parse/test_directory_parser_routing.py
@@ -174,8 +174,7 @@ def test_scan_classifies_all_files_correctly(
         for ext in self.TEXT_FALLBACK_EXTENSIONS:
             assert ext in processable_exts, f"{ext} should be processable (text-fallback)"
 
-        # .bmp and .rar are unsupported
-        assert ".bmp" in unsupported_exts
+        # .rar is unsupported
         assert ".rar" in unsupported_exts
 
     def test_each_processable_file_has_a_parser_or_is_text(

From 9c561a4f78deadc04695d796b9b8031523824832 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Sat, 21 Feb 2026 11:00:52 +0800
Subject: [PATCH 17/18] Implement smart dual-mode for add-resource and
 import-ovpack, and config system improvements

---
 examples/ov.conf.example                      |  3 +-
 examples/server_client/ov.conf.example        |  5 +-
 openviking/server/routers/pack.py             | 12 ++-
 openviking/server/routers/resources.py        | 54 +++++++++++-
 openviking/utils/media_processor.py           | 11 +++
 openviking_cli/client/http.py                 | 88 ++++++++++++++++---
 openviking_cli/utils/config/agfs_config.py    |  8 +-
 .../utils/config/open_viking_config.py        |  7 +-
 openviking_cli/utils/config/storage_config.py | 65 +++++++++++++-
 .../utils/config/vectordb_config.py           |  8 +-
 10 files changed, 225 insertions(+), 36 deletions(-)

diff --git a/examples/ov.conf.example b/examples/ov.conf.example
index 34cbc6a4..06822bad 100644
--- a/examples/ov.conf.example
+++ b/examples/ov.conf.example
@@ -6,10 +6,10 @@
     "cors_origins": ["*"]
   },
   "storage": {
+    "workspace": "./data",
     "vectordb": {
       "name": "context",
       "backend": "local",
-      "path": "./data",
       "volcengine": {
         "region": "cn-beijing",
         "ak": null,
@@ -19,7 +19,6 @@
     "agfs": {
       "port": 1833,
       "log_level": "warn",
-      "path": "./data",
       "backend": "local",
       "timeout": 10,
       "retry_times": 3,
diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example
index 13eb55db..57fe2ef6 100644
--- a/examples/server_client/ov.conf.example
+++ b/examples/server_client/ov.conf.example
@@ -6,15 +6,14 @@
     "cors_origins": ["*"]
   },
   "storage": {
+    "workspace": "./data",
     "vectordb": {
       "name": "context",
-      "backend": "local",
-      "path": "./data"
+      "backend": "local"
     },
     "agfs": {
       "port": 1833,
       "log_level": "warn",
-      "path": "./data",
       "backend": "local"
     }
   },
diff --git a/openviking/server/routers/pack.py b/openviking/server/routers/pack.py
index e486870b..6a29d4ce 100644
--- a/openviking/server/routers/pack.py
+++ b/openviking/server/routers/pack.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """Pack endpoints for OpenViking HTTP Server."""
 
+from typing import Optional
+
 from fastapi import APIRouter, Depends
 from pydantic import BaseModel
 
@@ -22,7 +24,8 @@ class ExportRequest(BaseModel):
 class ImportRequest(BaseModel):
     """Request model for import."""
 
-    file_path: str
+    file_path: Optional[str] = None
+    temp_path: Optional[str] = None
     parent: str
     force: bool = False
     vectorize: bool = True
@@ -46,8 +49,13 @@ async def import_ovpack(
 ):
     """Import .ovpack file."""
     service = get_service()
+
+    file_path = request.file_path
+    if request.temp_path:
+        file_path = request.temp_path
+
     result = await service.pack.import_ovpack(
-        request.file_path,
+        file_path,
         request.parent,
         force=request.force,
         vectorize=request.vectorize,
diff --git a/openviking/server/routers/resources.py b/openviking/server/routers/resources.py
index 7291dc2f..b1705988 100644
--- a/openviking/server/routers/resources.py
+++ b/openviking/server/routers/resources.py
@@ -2,14 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 """Resource endpoints for OpenViking HTTP Server."""
 
+import time
+import uuid
+from pathlib import Path
 from typing import Any, Optional
 
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, File, UploadFile
 from pydantic import BaseModel
 
 from openviking.server.auth import verify_api_key
 from openviking.server.dependencies import get_service
 from openviking.server.models import Response
+from openviking_cli.utils.config.open_viking_config import get_openviking_config
 
 router = APIRouter(prefix="/api/v1", tags=["resources"])
 
@@ -17,7 +21,8 @@
 class AddResourceRequest(BaseModel):
     """Request model for add_resource."""
 
-    path: str
+    path: Optional[str] = None
+    temp_path: Optional[str] = None
     target: Optional[str] = None
     reason: str = ""
     instruction: str = ""
@@ -33,6 +38,44 @@ class AddSkillRequest(BaseModel):
     timeout: Optional[float] = None
 
 
+def _cleanup_temp_files(temp_dir: Path, max_age_hours: int = 1):
+    """Clean up temporary files older than max_age_hours."""
+    if not temp_dir.exists():
+        return
+
+    now = time.time()
+    max_age_seconds = max_age_hours * 3600
+
+    for file_path in temp_dir.iterdir():
+        if file_path.is_file():
+            file_age = now - file_path.stat().st_mtime
+            if file_age > max_age_seconds:
+                file_path.unlink(missing_ok=True)
+
+
+@router.post("/resources/temp_upload")
+async def temp_upload(
+    file: UploadFile = File(...),
+    _: bool = Depends(verify_api_key),
+):
+    """Upload a temporary file for add_resource or import_ovpack."""
+    config = get_openviking_config()
+    temp_dir = config.storage.get_upload_temp_dir()
+
+    # Clean up old temporary files
+    _cleanup_temp_files(temp_dir)
+
+    # Save the uploaded file
+    file_ext = Path(file.filename).suffix if file.filename else ".tmp"
+    temp_filename = f"upload_{uuid.uuid4().hex}{file_ext}"
+    temp_file_path = temp_dir / temp_filename
+
+    with open(temp_file_path, "wb") as f:
+        f.write(await file.read())
+
+    return Response(status="ok", result={"temp_path": str(temp_file_path)})
+
+
 @router.post("/resources")
 async def add_resource(
     request: AddResourceRequest,
@@ -40,8 +83,13 @@ async def add_resource(
 ):
     """Add resource to OpenViking."""
     service = get_service()
+
+    path = request.path
+    if request.temp_path:
+        path = request.temp_path
+
     result = await service.resources.add_resource(
-        path=request.path,
+        path=path,
         target=request.target,
         reason=request.reason,
         instruction=request.instruction,
diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py
index 3ff58b2e..3e2475bc 100644
--- a/openviking/utils/media_processor.py
+++ b/openviking/utils/media_processor.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """Unified resource processor with strategy-based routing."""
 
+import tempfile
+import zipfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -103,6 +105,15 @@ async def _process_file(
         instruction: str,
     ) -> ParseResult:
         """Process file with unified parsing."""
+        # Check if it's a zip file
+        if zipfile.is_zipfile(file_path):
+            temp_dir = Path(tempfile.mkdtemp())
+            try:
+                with zipfile.ZipFile(file_path, "r") as zipf:
+                    zipf.extractall(temp_dir)
+                return await self._process_directory(temp_dir, instruction)
+            finally:
+                pass  # Keep temp_dir: TreeBuilder reads it later. TODO: delete it afterwards
         return await parse(
             str(file_path),
             instruction=instruction,
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py
index a5cb6903..b223ffc2 100644
--- a/openviking_cli/client/http.py
+++ b/openviking_cli/client/http.py
@@ -5,6 +5,10 @@
 Implements BaseClient interface using HTTP calls to OpenViking Server.
 """
 
+import tempfile
+import uuid
+import zipfile
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
 import httpx
@@ -219,6 +223,42 @@ def _raise_exception(self, error: Dict[str, Any]) -> None:
         else:
             raise exc_class(message)
 
+    def _is_local_server(self) -> bool:
+        """Check if the server URL is localhost or 127.0.0.1."""
+        from urllib.parse import urlparse
+
+        parsed_url = urlparse(self._url)
+        hostname = parsed_url.hostname
+        return hostname in ("localhost", "127.0.0.1")
+
+    def _zip_directory(self, dir_path: str) -> str:
+        """Create a temporary zip file from a directory."""
+        dir_path = Path(dir_path)
+        if not dir_path.is_dir():
+            raise ValueError(f"Path {dir_path} is not a directory")
+
+        temp_dir = tempfile.gettempdir()
+        zip_path = Path(temp_dir) / f"temp_upload_{uuid.uuid4().hex}.zip"
+
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+            for file_path in dir_path.rglob("*"):
+                if file_path.is_file():
+                    arcname = file_path.relative_to(dir_path)
+                    zipf.write(file_path, arcname=arcname)
+
+        return str(zip_path)
+
+    async def _upload_temp_file(self, file_path: str) -> str:
+        """Upload a file to /api/v1/resources/temp_upload and return the temp_path."""
+        with open(file_path, "rb") as f:
+            files = {"file": (Path(file_path).name, f, "application/octet-stream")}
+            response = await self._http.post(
+                "/api/v1/resources/temp_upload",
+                files=files,
+            )
+        result = self._handle_response(response)
+        return result.get("temp_path", "")
+
     # ============= Resource Management =============
 
     async def add_resource(
@@ -231,16 +271,28 @@ async def add_resource(
         timeout: Optional[float] = None,
     ) -> Dict[str, Any]:
         """Add resource to OpenViking."""
+        request_data = {
+            "target": target,
+            "reason": reason,
+            "instruction": instruction,
+            "wait": wait,
+            "timeout": timeout,
+        }
+
+        path_obj = Path(path)
+        if path_obj.exists() and path_obj.is_dir() and not self._is_local_server():
+            zip_path = self._zip_directory(path)
+            try:
+                temp_path = await self._upload_temp_file(zip_path)
+                request_data["temp_path"] = temp_path
+            finally:
+                Path(zip_path).unlink(missing_ok=True)
+        else:
+            request_data["path"] = path
+
         response = await self._http.post(
             "/api/v1/resources",
-            json={
-                "path": path,
-                "target": target,
-                "reason": reason,
-                "instruction": instruction,
-                "wait": wait,
-                "timeout": timeout,
-            },
+            json=request_data,
         )
         return self._handle_response(response)
 
@@ -554,14 +606,22 @@ async def import_ovpack(
     ) -> str:
         """Import .ovpack file."""
         parent = VikingURI.normalize(parent)
+        request_data = {
+            "parent": parent,
+            "force": force,
+            "vectorize": vectorize,
+        }
+
+        file_path_obj = Path(file_path)
+        if file_path_obj.exists() and file_path_obj.is_file() and not self._is_local_server():
+            temp_path = await self._upload_temp_file(file_path)
+            request_data["temp_path"] = temp_path
+        else:
+            request_data["file_path"] = file_path
+
         response = await self._http.post(
             "/api/v1/pack/import",
-            json={
-                "file_path": file_path,
-                "parent": parent,
-                "force": force,
-                "vectorize": vectorize,
-            },
+            json=request_data,
         )
         result = self._handle_response(response)
         return result.get("uri", "")
diff --git a/openviking_cli/utils/config/agfs_config.py b/openviking_cli/utils/config/agfs_config.py
index de7421e1..4f2d3d72 100644
--- a/openviking_cli/utils/config/agfs_config.py
+++ b/openviking_cli/utils/config/agfs_config.py
@@ -66,7 +66,10 @@ def validate_config(self):
 class AGFSConfig(BaseModel):
     """Configuration for AGFS (Agent Global File System)."""
 
-    path: str = Field(default="./data", description="AGFS data storage path")
+    path: Optional[str] = Field(
+        default=None,
+        description="[Deprecated in favor of `storage.workspace`] AGFS data storage path. This will be ignored if `storage.workspace` is set.",
+    )
 
     port: int = Field(default=1833, description="AGFS service port")
 
@@ -105,8 +108,7 @@ def validate_config(self):
             )
 
         if self.backend == "local":
-            if not self.path:
-                raise ValueError("AGFS local backend requires 'path' to be set")
+            pass
 
         elif self.backend == "s3":
             # Validate S3 configuration
diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py
index ac615e9c..14a36b74 100644
--- a/openviking_cli/utils/config/open_viking_config.py
+++ b/openviking_cli/utils/config/open_viking_config.py
@@ -138,7 +138,7 @@ def from_dict(cls, config: Dict[str, Any]) -> "OpenVikingConfig":
 
         # Remove sections managed by other loaders (e.g. server config)
         config_copy.pop("server", None)
-        
+
         # Handle parser configurations from nested "parsers" section
         parser_configs = {}
         if "parsers" in config_copy:
@@ -316,7 +316,7 @@ def initialize_openviking_config(
 
     Args:
         user: UserIdentifier for session management
-        path: Local storage path for embedded mode
+        path: Local storage path (workspace) for embedded mode
 
     Returns:
         Configured OpenVikingConfig instance
@@ -337,9 +337,8 @@ def initialize_openviking_config(
     if path:
         # Embedded mode: local storage
         config.storage.agfs.backend = config.storage.agfs.backend or "local"
-        config.storage.agfs.path = path
         config.storage.vectordb.backend = config.storage.vectordb.backend or "local"
-        config.storage.vectordb.path = path
+        config.storage.workspace = path
 
     # Ensure vector dimension is synced if not set in storage
     if config.storage.vectordb.dimension == 0:
diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py
index b6ce378a..4682a10f 100644
--- a/openviking_cli/utils/config/storage_config.py
+++ b/openviking_cli/utils/config/storage_config.py
@@ -1,15 +1,27 @@
 # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
 # SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
 from typing import Any, Dict
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
+
+from openviking_cli.utils.logger import get_logger
 
 from .agfs_config import AGFSConfig
 from .vectordb_config import VectorDBBackendConfig
 
+logger = get_logger(__name__)
+
 
 class StorageConfig(BaseModel):
-    """Configuration for storage backend."""
+    """Configuration for storage backend.
+
+    The `workspace` field is the primary configuration for local data storage.
+    It always overrides the deprecated `path` fields in the `agfs` and
+    `vectordb` configurations, even when those are set explicitly.
+    """
+
+    workspace: str = Field(default="./data", description="Local data storage path (primary)")
 
     agfs: AGFSConfig = Field(default_factory=lambda: AGFSConfig(), description="AGFS configuration")
 
@@ -23,3 +35,52 @@ class StorageConfig(BaseModel):
     )
 
     model_config = {"extra": "forbid"}
+
+    @model_validator(mode="after")
+    def resolve_paths(self):
+        """Resolve path conflicts between workspace and individual path configs.
+
+        When workspace is set:
+        - Ignore agfs.path and vectordb.path
+        - Set agfs.path to {workspace}/.agfs
+        - Set vectordb.path to {workspace}/vectordb
+        - Warn if agfs.path or vectordb.path were explicitly set to different values
+        """
+        workspace_path = Path(self.workspace).resolve()
+
+        # Check for AGFS path conflict
+        if self.agfs.path is not None:  # User explicitly set agfs.path
+            agfs_path = Path(self.agfs.path).resolve()
+            expected_agfs_path = workspace_path / ".agfs"
+            if agfs_path != expected_agfs_path:
+                logger.warning(
+                    f"StorageConfig: 'agfs.path' is deprecated and will be ignored. "
+                    f"Using '{expected_agfs_path}' from workspace instead of '{agfs_path}'"
+                )
+
+        # Check for VectorDB path conflict
+        if self.vectordb.path is not None:  # User explicitly set vectordb.path
+            vectordb_path = Path(self.vectordb.path).resolve()
+            expected_vectordb_path = workspace_path / "vectordb"
+            if vectordb_path != expected_vectordb_path:
+                logger.warning(
+                    f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. "
+                    f"Using '{expected_vectordb_path}' from workspace instead of '{vectordb_path}'"
+                )
+
+        # Update paths to use workspace
+        self.agfs.path = str(workspace_path / ".agfs")
+        self.vectordb.path = str(workspace_path / "vectordb")
+
+        return self
+
+    def get_upload_temp_dir(self) -> Path:
+        """Get the temporary directory for file uploads.
+
+        Returns:
+            Path to {workspace}/temp/upload directory
+        """
+        workspace_path = Path(self.workspace).resolve()
+        upload_temp_dir = workspace_path / "temp" / "upload"
+        upload_temp_dir.mkdir(parents=True, exist_ok=True)
+        return upload_temp_dir
diff --git a/openviking_cli/utils/config/vectordb_config.py b/openviking_cli/utils/config/vectordb_config.py
index 6984322c..2e2ccd42 100644
--- a/openviking_cli/utils/config/vectordb_config.py
+++ b/openviking_cli/utils/config/vectordb_config.py
@@ -46,7 +46,10 @@ class VectorDBBackendConfig(BaseModel):
 
     name: Optional[str] = Field(default=COLLECTION_NAME, description="Collection name for VectorDB")
 
-    path: Optional[str] = Field(default="./data", description="Local storage path for 'local' type")
+    path: Optional[str] = Field(
+        default=None,
+        description="[Deprecated in favor of `storage.workspace`] Local storage path for 'local' type. This will be ignored if `storage.workspace` is set.",
+    )
 
     url: Optional[str] = Field(
         default=None,
@@ -93,8 +96,7 @@ def validate_config(self):
             )
 
         if self.backend == "local":
-            if not self.path:
-                raise ValueError("VectorDB local backend requires 'path' to be set")
+            pass
 
         elif self.backend == "http":
             if not self.url:

From b100a2b7d4f65eb6d615ee20b267172a0917ad22 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Sun, 22 Feb 2026 18:16:15 +0800
Subject: [PATCH 18/18] feat: support local upload

---
 crates/ov_cli/README.md                       |  2 +
 openviking/parse/directory_scan.py            |  5 +-
 openviking_cli/utils/config/storage_config.py | 49 +++++++------------
 pyproject.toml                                |  1 +
 uv.lock                                       | 11 +++++
 5 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/crates/ov_cli/README.md b/crates/ov_cli/README.md
index 12115a0f..e14b0a71 100644
--- a/crates/ov_cli/README.md
+++ b/crates/ov_cli/README.md
@@ -13,6 +13,8 @@ curl -fsSL https://raw.githubusercontent.com/volcengine/OpenViking/main/crates/o
 ### From Source
 
 ```bash
+# OpenViking requires Rust >= 1.88; upgrade first if necessary, e.g.:
+# brew upgrade rust
 cargo install --path crates/ov_cli
 ```
 
diff --git a/openviking/parse/directory_scan.py b/openviking/parse/directory_scan.py
index 8da532f2..07b10283 100644
--- a/openviking/parse/directory_scan.py
+++ b/openviking/parse/directory_scan.py
@@ -175,7 +175,7 @@ def _classify_file(
 def scan_directory(
     root: Union[str, Path],
     registry: Optional[ParserRegistry] = None,
-    strict: bool = True,
+    strict: bool = False,
     ignore_dirs: Optional[Set[str]] = None,
     include: Optional[str] = None,
     exclude: Optional[str] = None,
@@ -272,7 +272,10 @@ def scan_directory(
             f"Unsupported: {unsupported_paths[:10]}{'...' if len(unsupported_paths) > 10 else ''}"
         )
         if strict:
+            logger.error(msg)
             raise UnsupportedDirectoryFilesError(msg, unsupported_paths)
+        else:
+            logger.warning(msg)
         result.warnings.append(msg)
         for rel in unsupported_paths:
             result.warnings.append(f"  - {rel}")
diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py
index 4682a10f..8daf6a79 100644
--- a/openviking_cli/utils/config/storage_config.py
+++ b/openviking_cli/utils/config/storage_config.py
@@ -38,40 +38,25 @@ class StorageConfig(BaseModel):
 
     @model_validator(mode="after")
     def resolve_paths(self):
-        """Resolve path conflicts between workspace and individual path configs.
-
-        When workspace is set:
-        - Ignore agfs.path and vectordb.path
-        - Set agfs.path to {workspace}/.agfs
-        - Set vectordb.path to {workspace}/vectordb
-        - Warn if agfs.path or vectordb.path were explicitly set to different values
-        """
-        workspace_path = Path(self.workspace).resolve()
-
-        # Check for AGFS path conflict
-        if self.agfs.path is not None:  # User explicitly set agfs.path
-            agfs_path = Path(self.agfs.path).resolve()
-            expected_agfs_path = workspace_path / ".agfs"
-            if agfs_path != expected_agfs_path:
-                logger.warning(
-                    f"StorageConfig: 'agfs.path' is deprecated and will be ignored. "
-                    f"Using '{expected_agfs_path}' from workspace instead of '{agfs_path}'"
-                )
-
-        # Check for VectorDB path conflict
-        if self.vectordb.path is not None:  # User explicitly set vectordb.path
-            vectordb_path = Path(self.vectordb.path).resolve()
-            expected_vectordb_path = workspace_path / "vectordb"
-            if vectordb_path != expected_vectordb_path:
-                logger.warning(
-                    f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. "
-                    f"Using '{expected_vectordb_path}' from workspace instead of '{vectordb_path}'"
-                )
+        if self.agfs.path is not None:
+            logger.warning(
+                f"StorageConfig: 'agfs.path' is deprecated and will be ignored. "
+                f"Using '{self.workspace}' from workspace instead of '{self.agfs.path}'"
+            )
+
+        if self.vectordb.path is not None:
+            logger.warning(
+                f"StorageConfig: 'vectordb.path' is deprecated and will be ignored. "
+                f"Using '{self.workspace}' from workspace instead of '{self.vectordb.path}'"
+            )
 
         # Update paths to use workspace
-        self.agfs.path = str(workspace_path / ".agfs")
-        self.vectordb.path = str(workspace_path / "vectordb")
-
+        workspace_path = Path(self.workspace).resolve()
+        workspace_path.mkdir(parents=True, exist_ok=True)
+        self.workspace = str(workspace_path)
+        self.agfs.path = self.workspace
+        self.vectordb.path = self.workspace
+        # logger.info(f"StorageConfig: Using workspace '{self.workspace}' for storage")
         return self
 
     def get_upload_temp_dir(self) -> Path:
diff --git a/pyproject.toml b/pyproject.toml
index b6efd649..0924e749 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
     "pdfminer-six>=20251230",
     "typer>=0.12.0",
     "litellm>=1.0.0",
+    "python-multipart>=0.0.22",
 ]
 
 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index 07a6ba5d..678904d5 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1915,6 +1915,7 @@ dependencies = [
     { name = "pyagfs" },
     { name = "pydantic" },
     { name = "python-docx" },
+    { name = "python-multipart" },
     { name = "python-pptx" },
     { name = "pyyaml" },
     { name = "readabilipy" },
@@ -1973,6 +1974,7 @@ requires-dist = [
     { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=0.21.0" },
     { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" },
     { name = "python-docx", specifier = ">=1.0.0" },
+    { name = "python-multipart", specifier = ">=0.0.22" },
     { name = "python-pptx", specifier = ">=1.0.0" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "readabilipy", specifier = ">=0.2.0" },
@@ -2588,6 +2590,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
 ]
 
+[[package]]
+name = "python-multipart"
+version = "0.0.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" },
+]
+
 [[package]]
 name = "python-pptx"
 version = "1.0.2"