diff --git a/docs/design/multi-tenant-design.md b/docs/design/multi-tenant-design.md index 6c5ad43d..9a131ac2 100644 --- a/docs/design/multi-tenant-design.md +++ b/docs/design/multi-tenant-design.md @@ -283,7 +283,7 @@ def agent_space_name(self) -> str: | `agent/instructions` | `/{account_id}/agent/{agent_space}/instructions/` | account + user + agent | agent 的行为规则,每用户独立 | | `resources/` | `/{account_id}/resources/` | account | account 内共享的知识资源 | | `session/` | `/{account_id}/session/{user_space}/{session_id}/` | account + user | 用户的对话记录 | -| `transactions/` | `/{account_id}/transactions/` | account | 账户级事务记录 | +| `redo/` | `/{account_id}/_system/redo/` | account | 崩溃恢复 redo 标记 | | `_system/`(全局) | `/_system/` | 系统级 | 全局工作区列表 | | `_system/`(per-account) | `/{account_id}/_system/` | account | 用户注册表 | diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md new file mode 100644 index 00000000..edbda724 --- /dev/null +++ b/docs/en/concepts/09-transaction.md @@ -0,0 +1,363 @@ +# Path Locks and Crash Recovery + +OpenViking uses two simple primitives — **path locks** and **redo log** — to protect the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain consistent even when failures occur. + +## Design Philosophy + +OpenViking is a context database where FS is the source of truth and VectorDB is a derived index. A lost index can be rebuilt from source data, but lost source data is unrecoverable. Therefore: + +> **Better to miss a search result than to return a bad one.** + +## Design Principles + +1. **Write-exclusive**: Path locks ensure only one write operation can operate on a path at a time +2. **On by default**: All data operations automatically acquire locks; no extra configuration needed +3. **Lock as protection**: LockContext acquires locks on entry, releases on exit — no undo/journal/commit semantics +4. 
**Only session_memory needs crash recovery**: RedoLog re-executes memory extraction after a process crash +5. **Queue operations run outside locks**: SemanticQueue/EmbeddingQueue enqueue operations are idempotent and retriable + +## Architecture + +``` +Service Layer (rm / mv / add_resource / session.commit) + | + v ++--[LockContext async context manager]--+ +| | +| 1. Create LockHandle | +| 2. Acquire path lock (poll+timeout) | +| 3. Execute operations (FS+VectorDB) | +| 4. Release lock | +| | +| On exception: auto-release lock, | +| exception propagates unchanged | ++---------------------------------------+ + | + v +Storage Layer (VikingFS, VectorDB, QueueManager) +``` + +## Two Core Components + +### Component 1: PathLock + LockManager + LockContext (Path Lock System) + +**PathLock** implements file-based distributed locks with two lock types — POINT and SUBTREE — using fencing tokens to prevent TOCTOU races and automatic stale lock detection and cleanup. + +**LockHandle** is a lightweight lock holder token: + +```python +@dataclass +class LockHandle: + id: str # Unique ID used to generate fencing tokens + locks: list[str] # Acquired lock file paths + created_at: float # Creation time +``` + +**LockManager** is a global singleton managing lock lifecycle: +- Creates/releases LockHandles +- Background cleanup of leaked locks (in-process safety net) +- Executes RedoLog recovery on startup + +**LockContext** is an async context manager encapsulating the lock/unlock lifecycle: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # Perform operations under lock protection + ... +# Lock automatically released on exit (including exceptions) +``` + +### Component 2: RedoLog (Crash Recovery) + +Used only for the memory extraction phase of `session.commit`. 
Writes a marker before the operation, deletes it after success, and scans for leftover markers on startup to redo. + +``` +/local/_system/redo/{task_id}/redo.json +``` + +Memory extraction is idempotent — re-extracting from the same archive produces the same result. + +## Consistency Issues and Solutions + +### rm(uri) + +| Problem | Solution | +|---------|----------| +| Delete file first, then index -> file gone but index remains -> search returns non-existent file | **Reverse order**: delete index first, then file. Index deletion failure -> both file and index intact | + +**Locking strategy** (depends on target type): +- Deleting a **directory**: `lock_mode="subtree"`, locks the directory itself +- Deleting a **file**: `lock_mode="point"`, locks the file's parent directory + +Operation flow: + +``` +1. Check whether target is a directory or file, choose lock mode +2. Acquire lock +3. Delete VectorDB index -> immediately invisible to search +4. Delete FS file +5. Release lock +``` + +VectorDB deletion fails -> exception thrown, lock auto-released, file and index both intact. FS deletion fails -> VectorDB already deleted but file remains, retry is safe. + +### mv(old_uri, new_uri) + +| Problem | Solution | +|---------|----------| +| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Copy first then update index; clean up copy on failure | + +**Locking strategy** (handled automatically via `lock_mode="mv"`): +- Moving a **directory**: SUBTREE lock on both source path and destination parent +- Moving a **file**: POINT lock on both source's parent and destination parent + +Operation flow: + +``` +1. Check whether source is a directory or file, set src_is_dir +2. Acquire mv lock (internally chooses SUBTREE or POINT based on src_is_dir) +3. Copy to new location (source still intact, safe) +4. If directory, remove the lock file carried over by cp into the copy +5. 
Update VectorDB URIs + - Failure -> clean up copy, source and old index intact, consistent state +6. Delete source +7. Release lock +``` + +### add_resource + +| Problem | Solution | +|---------|----------| +| File moved from temp to final directory, then crash -> file exists but never searchable | Two separate paths for first-time add vs incremental update | +| Resource already on disk but rm deletes it while semantic processing / vectorization is still running -> wasted work | Lifecycle SUBTREE lock held from finalization through processing completion | + +**First-time add** (target does not exist) — handled in `ResourceProcessor.process_resource` Phase 3.5: + +``` +1. Acquire POINT lock on parent of final_uri +2. agfs.mv temp directory -> final location +3. Acquire SUBTREE lock on final_uri (inside POINT lock, eliminating race window) +4. Release POINT lock +5. Clean up temp directory +6. Enqueue SemanticMsg(lifecycle_lock_handle_id=...) -> DAG runs on final +7. DAG starts lock refresh loop (refreshes timestamp every lock_expire/2 seconds) +8. DAG complete + all embeddings done -> release SUBTREE lock +``` + +During this period, `rm` attempting to acquire a SUBTREE lock on the same path will fail with `ResourceBusyError`. + +**Incremental update** (target already exists) — temp stays in place: + +``` +1. Acquire SUBTREE lock on target_uri (protect existing resource) +2. Enqueue SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG runs on temp, lock refresh loop active +4. DAG completion triggers sync_diff_callback or move_temp_to_target_callback +5. Callback completes -> release SUBTREE lock +``` + +Note: DAG callbacks do NOT wrap operations in an outer lock. Each `VikingFS.rm` and `VikingFS.mv` has its own lock internally. An outer lock would conflict with these inner locks causing deadlock. + +**Server restart recovery**: SemanticMsg is persisted in QueueFS. 
On restart, `SemanticProcessor` detects that the `lifecycle_lock_handle_id` handle is missing from the in-memory LockManager and re-acquires a SUBTREE lock. + +### session.commit() + +| Problem | Solution | +|---------|----------| +| Messages cleared but archive not written -> conversation data lost | Phase 1 without lock (incomplete archive has no side effects) + Phase 2 with RedoLog | + +LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding operation. The design splits into two phases: + +``` +Phase 1 — Archive (no lock): + 1. Generate archive summary (LLM) + 2. Write archive (history/archive_N/messages.jsonl + summaries) + 3. Clear messages.jsonl + 4. Clear in-memory message list + +Phase 2 — Memory extraction + write (RedoLog): + 1. Write redo marker (archive_uri, session_uri, user identity) + 2. Extract memories from archived messages (LLM) + 3. Write current message state + 4. Write relations + 5. Directly enqueue SemanticQueue + 6. Delete redo marker +``` + +**Crash recovery analysis**: + +| Crash point | State | Recovery action | +|------------|-------|----------------| +| During Phase 1 archive write | No marker | Incomplete archive; next commit scans history/ for index, unaffected | +| Phase 1 archive complete but messages not cleared | No marker | Archive complete + messages still present = redundant but safe | +| During Phase 2 memory extraction/write | Redo marker exists | On startup: redo extraction + write + enqueue from archive | +| Phase 2 complete | Redo marker deleted | No recovery needed | + +## LockContext + +`LockContext` is an **async** context manager that encapsulates lock acquisition and release: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +lock_manager = get_lock_manager() + +# Point lock (write operations, semantic processing) +async with LockContext(lock_manager, [path], lock_mode="point"): + # Perform operations... 
+ pass + +# Subtree lock (delete operations) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # Perform operations... + pass + +# MV lock (move operations) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): + # Perform operations... + pass +``` + +**Lock modes**: + +| lock_mode | Use case | Behavior | +|-----------|----------|----------| +| `point` | Write operations, semantic processing | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | +| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | +| `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | + +**Exception handling**: `__aexit__` always releases locks and does not swallow exceptions. Lock acquisition failure raises `LockAcquisitionError`. + +## Lock Types (POINT vs SUBTREE) + +The lock mechanism uses two lock types to handle different conflict patterns: + +| | POINT on same path | SUBTREE on same path | POINT on descendant | SUBTREE on ancestor | +|---|---|---|---|---| +| **POINT** | Conflict | Conflict | — | Conflict | +| **SUBTREE** | Conflict | Conflict | Conflict | Conflict | + +- **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. +- **SUBTREE (S)**: Used for rm and mv operations. Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants and ancestor directories for conflicting locks. + +## Lock Mechanism + +### Lock Protocol + +Lock file path: `{path}/.path.ovlock` + +Lock file content (Fencing Token): +``` +{handle_id}:{time_ns}:{lock_type} +``` + +Where `lock_type` is `P` (POINT) or `S` (SUBTREE). 
+ +### Lock Acquisition (POINT mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another operation + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Write POINT (P) lock file + 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 6. Verify lock file ownership (fencing token matches) + 7. Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError +``` + +### Lock Acquisition (SUBTREE mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another operation + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Scan all descendant directories for any locks by other operations + - Stale lock? -> remove and retry + - Active lock? -> wait + 5. Write SUBTREE (S) lock file (only one file, at the root path) + 6. TOCTOU double-check: re-scan descendants and ancestors + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 7. Verify lock file ownership (fencing token matches) + 8. Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError +``` + +### Lock Expiry Cleanup + +**Stale lock detection**: PathLock checks the fencing token timestamp. Locks older than `lock_expire` (default 300s) are considered stale and are removed automatically during acquisition. + +**In-process cleanup**: LockManager checks active LockHandles every 60 seconds. 
Handles created more than 3600 seconds ago are force-released. + +**Orphan locks**: Lock files left behind after a process crash are automatically removed via stale lock detection when any operation next attempts to acquire a lock on the same path. + +## Crash Recovery + +`LockManager.start()` automatically scans for leftover markers in `/local/_system/redo/` on startup: + +| Scenario | Recovery action | +|----------|----------------| +| session_memory extraction crash | Redo memory extraction + write + enqueue from archive | +| Crash while holding lock | Lock file remains in AGFS; stale detection auto-cleans on next acquisition (default 300s expiry) | +| Crash after enqueue, before worker processes | QueueFS SQLite persistence; worker auto-pulls after restart | +| Orphan index | Cleaned on L2 on-demand load | + +### Defense Summary + +| Failure scenario | Defense | Recovery timing | +|-----------------|--------|-----------------| +| Crash during operation | Lock auto-expires + stale detection | Next acquisition of same path lock | +| Crash during add_resource semantic processing | Lifecycle lock expires + SemanticProcessor re-acquires on restart | Worker restart | +| Crash during session.commit Phase 2 | RedoLog marker + redo | On restart | +| Crash after enqueue, before worker | QueueFS SQLite persistence | Worker restart | +| Orphan index | L2 on-demand load cleanup | When user accesses | + +## Configuration + +Path locks are enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Lock acquisition timeout (seconds). `0` = fail immediately if locked (default). 
`> 0` = wait/retry up to this many seconds. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | + +### QueueFS Persistence + +The lock mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. + +## Related Documentation + +- [Architecture](./01-architecture.md) - System architecture overview +- [Storage](./05-storage.md) - AGFS and vector store +- [Session Management](./08-session.md) - Session and memory management +- [Configuration](../guides/01-configuration.md) - Configuration reference diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 7c2779b6..1a60fa89 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -515,7 +515,6 @@ Supports S3 storage in VirtualHostStyle mode, such as TOS. - #### vectordb Vector database storage configuration @@ -639,6 +638,28 @@ When `root_api_key` is configured, the server enables multi-tenant authenticatio For startup and deployment details see [Deployment](./03-deployment.md), for authentication see [Authentication](./04-authentication.md). +## storage.transaction Section + +Path locks are enabled by default and usually require no configuration. **The default behavior is no-wait**: if the target path is already locked by another operation, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Path lock acquisition timeout (seconds). `0` = fail immediately if locked (default). 
`> 0` = wait/retry up to this many seconds, then raise `LockAcquisitionError`. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | + +For details on the lock mechanism, see [Path Locks and Crash Recovery](../concepts/09-transaction.md). + ## Full Schema ```json @@ -673,6 +694,10 @@ For startup and deployment details see [Deployment](./03-deployment.md), for aut "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 503ed683..45a10d63 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -1,167 +1,362 @@ -# 事务机制 +# 路径锁与崩溃恢复 -OpenViking 的事务机制为 AI Agent 上下文数据库提供可靠的操作保障,解决数据一致性、并发控制和错误恢复等核心问题。 +OpenViking 通过**路径锁**和**Redo Log** 两个简单原语保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 -## 概览 +## 设计哲学 +OpenViking 是上下文数据库,FS 是源数据,VectorDB 是派生索引。索引丢了可从源数据重建,源数据丢失不可恢复。因此: + +> **宁可搜不到,不要搜到坏结果。** + +## 设计原则 + +1. **写互斥**:通过路径锁保证同一路径同一时间只有一个写操作 +2. **默认生效**:所有数据操作命令自动加锁,用户无需额外配置 +3. **锁即保护**:进入 LockContext 时加锁,退出时释放,没有 undo/journal/commit 语义 +4. **仅 session_memory 需要崩溃恢复**:通过 RedoLog 在进程崩溃后重做记忆提取 +5. **Queue 操作在锁外执行**:SemanticQueue/EmbeddingQueue 的 enqueue 是幂等的,失败可重试 + +## 架构 + +``` +Service Layer (rm / mv / add_resource / session.commit) + | + v ++--[LockContext 异步上下文管理器]-------+ +| | +| 1. 创建 LockHandle | +| 2. 获取路径锁(轮询 + 超时) | +| 3. 执行操作(FS + VectorDB) | +| 4. 
释放锁 | +| | +| 异常时:自动释放锁,异常原样传播 | ++---------------------------------------+ + | + v +Storage Layer (VikingFS, VectorDB, QueueManager) ``` -操作请求 → TransactionManager → 锁保护 → 执行操作 → 状态更新 - ↓ ↓ ↓ - 事务ID分配和事务状态管理 路径锁校验和加锁 +## 两个核心组件 + +### 组件 1:PathLock + LockManager + LockContext(路径锁系统) + +**PathLock** 实现基于文件的分布式锁,支持 POINT 和 SUBTREE 两种锁类型,使用 fencing token 防止 TOCTOU 竞争,自动检测并清理过期锁。 -事务生命周期:开始操作 → 创建事务 → 锁保护生效 → 文件系统同步操作 → 摘要和索引异步操作 → 移除锁保护 → 事务结束 +**LockHandle** 是轻量的锁持有者令牌: + +```python +@dataclass +class LockHandle: + id: str # 唯一标识,用于生成 fencing token + locks: list[str] # 已获取的锁文件路径 + created_at: float # 创建时间 ``` -**设计原则**: -1. **最小化锁粒度**:仅支持路径锁机制,不实现复杂的 MVCC 等 -2. **写互斥优先**:暂不实现读锁(共享锁),先承诺写操作的互斥性 -3. **渐进式扩展**:避免过度设计,聚焦核心需求,未来需要时再添加更复杂的锁机制 -4. **默认生效**:所有数据操作命令均开启事务机制,用户无需额外配置 +**LockManager** 是全局单例,管理锁生命周期: +- 创建/释放 LockHandle +- 后台清理泄漏的锁(进程内安全网) +- 启动时执行 RedoLog 恢复 -## 核心需求分析 +**LockContext** 是异步上下文管理器,封装加锁/解锁生命周期: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager -OpenViking 的数据操作命令(如 `add_resource`、`rm`、`mv` 等)存在以下无保护操作问题: +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # 在锁保护下执行操作 + ... +# 退出时自动释放锁(包括异常情况) +``` -1. **并发冲突**:多个用户同时操作同一目录可能导致数据不一致 -2. **无原子性**:`add_resource` 多阶段操作中,某个阶段失败可能留下中间状态 -3. **无可观测性**:操作结果无法预测,用户无法直接观察到正在操作的状态 +### 组件 2:RedoLog(崩溃恢复) -## 系统一致性要求 +仅用于 `session.commit` 的记忆提取阶段。操作前写标记,成功后删标记,启动时扫描遗留标记并重做。 -从系统分析的角度,OpenViking 要求实现组件间的分布式一致性: +``` +/local/_system/redo/{task_id}/redo.json +``` -1. **向量索引的最终一致**:所有上下文数据的向量表征依托独立的向量数据库或向量索引实现,要求确保在任何操作序列下,向量表示的更新都能实现最终一致 -2. **文件系统的读写一致性**:所有上下文数据的文件系统表示依托 VikingFS 实现,底层为 AGFS 桥接的分布式文件系统,要求确保在任何操作序列下,文件系统的更新都能保证数据不会损坏或丢失 -3. 
**队列和异步数据处理的一致性**:所有上下文数据的异步操作依托队列实现,要求确保在任何操作序列下,队列中的数据都能实现最终一致,即队列中的数据会最终被处理,不会丢失或重复 +Memory 提取是幂等的 — 从同一个 archive 重新提取会得到相同结果。 -## TransactionManager(事务管理器) +## 一致性问题与解决方案 -TransactionManager 是全局单例,负责管理事务生命周期和锁机制实现。 +### rm(uri) -### 核心职责 +| 问题 | 方案 | +|------|------| +| 先删文件再删索引 -> 文件已删但索引残留 -> 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 -> 文件和索引都在,搜索正常 | -- 分配事务ID -- 管理事务生命周期(开始、提交、回滚) -- 提供事务的锁机制实现接口,防止死锁 +**加锁策略**(根据目标类型区分): +- 删除**目录**:`lock_mode="subtree"`,锁目录自身 +- 删除**文件**:`lock_mode="point"`,锁文件的父目录 -### 关键特性 +操作流程: ``` -路径锁 + 写互斥 = 并发冲突防护 +1. 检查目标是目录还是文件,选择锁模式 +2. 获取锁 +3. 删除 VectorDB 索引 -> 搜索立刻不可见 +4. 删除 FS 文件 +5. 释放锁 ``` -- **路径锁**:锁定目标目录,防止并发的目录级操作如目录删除、目录移动等 -- **写互斥**:同一时间只允许一个事务写操作,路径锁机制确保所有写操作的互斥性 -- **事务结束状态**:事务有明确的结束状态,包括完成、失败丢弃等 +VectorDB 删除失败 -> 直接抛异常,锁自动释放,文件和索引都在。FS 删除失败 -> VectorDB 已删但文件还在,重试即可。 + +### mv(old_uri, new_uri) -### 事务状态机 +| 问题 | 方案 | +|------|------| +| 文件移到新路径但索引指向旧路径 -> 搜索返回旧路径(不存在) | 先 copy 再更新索引,失败时清理副本 | + +**加锁策略**(通过 `lock_mode="mv"` 自动处理): +- 移动**目录**:源路径和目标父目录各加 SUBTREE 锁 +- 移动**文件**:源的父目录和目标父目录各加 POINT 锁 + +操作流程: ``` -INIT → AQUIRE → EXEC → COMMIT/FAIL → RELEASING → RELEASED +1. 检查源是目录还是文件,确定 src_is_dir +2. 获取 mv 锁(内部根据 src_is_dir 选择 SUBTREE 或 POINT) +3. Copy 到新位置(源还在,安全) +4. 如果是目录,删除副本中被 cp 带过去的锁文件 +5. 更新 VectorDB 中的 URI + - 失败 -> 清理副本,源和旧索引都在,一致状态 +6. 删除源 +7. 释放锁 ``` -**状态说明**: -- `INIT`:事务初始化完成,等待锁获取 -- `AQUIRE`:正在获取锁资源 -- `EXEC`:事务操作正在执行 -- `COMMIT/FAIL`:事务执行完成,进入最终状态 -- `RELEASING`:正在释放锁资源 -- `RELEASED`:锁资源已完全释放,事务结束 +### add_resource -### 事务记录属性 +| 问题 | 方案 | +|------|------| +| 文件从临时目录移到正式目录后崩溃 -> 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | +| 资源已落盘但语义处理/向量化还在跑时被 rm 删除 -> 处理白跑 | 生命周期 SUBTREE 锁,从落盘持续到处理完成 | + +**首次添加**(target 不存在)— 在 `ResourceProcessor.process_resource` Phase 3.5 中处理: -```python -TransactionRecord( - id: str, # 事务ID,采用 uuid 格式,唯一标识一个事务 - locks: List[str], # 锁列表 - status: str, # 当前状态 - init_info: Dict, # 事务初始化信息 - rollback_info: Dict, # 回滚信息 - created_at: float, # 创建时间 - updated_at: float, # 更新时间 -) +``` +1. 
获取 POINT 锁,锁 final_uri 的父目录 +2. agfs.mv 临时目录 -> 正式位置 +3. 获取 SUBTREE 锁,锁 final_uri(在 POINT 锁内,消除竞态窗口) +4. 释放 POINT 锁 +5. 清理临时目录 +6. 入队 SemanticMsg(lifecycle_lock_handle_id=...) -> DAG 在 final 上跑 +7. DAG 启动锁刷新循环(每 lock_expire/2 秒刷新时间戳) +8. DAG 完成 + 所有 embedding 完成 -> 释放 SUBTREE 锁 ``` -### 设计决策 +此期间 `rm` 尝试获取同路径 SUBTREE 锁会失败,抛出 `ResourceBusyError`。 -- 暂不实现共享锁(读锁),简化设计 -- 锁粒度仅限目录,不实现范围锁机制 -- 不实现复杂的死锁检测,通过超时机制防止死锁,事务超时后自动释放所有锁 -- 支持可选的自下而上并行加锁模式,提升大型目录树操作的性能和一致性 -- 事务状态机增加AQUIRE+RELEASING状态,明确跟踪锁释放过程,提高系统可观测性 +**增量更新**(target 已存在)— temp 保持不动: -## 锁机制 +``` +1. 获取 SUBTREE 锁,锁 target_uri(保护已有资源) +2. 入队 SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG 在 temp 上跑,启动锁刷新循环 +4. DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback +5. callback 执行完毕 -> 释放 SUBTREE 锁 +``` -锁机制是事务管理的核心组件,当前只提供路径锁类型。 +注意:DAG callback 不在外层加锁。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立锁保护。外层锁会与内部锁冲突导致死锁。 -### 锁类型 +**服务重启恢复**:SemanticMsg 持久化在 QueueFS 中。重启后 `SemanticProcessor` 发现 `lifecycle_lock_handle_id` 对应的 handle 不在内存中,会重新获取 SUBTREE 锁。 -| 锁类型 | 作用范围 | 用例 | -|--------|----------|------| -| 路径锁 | 整个目录 | 用于阻止目录被意外整体移动或删除,确保事务操作过程的路径合法性 +### session.commit() -### 锁协议 +| 问题 | 方案 | +|------|------| +| 消息已清空但 archive 未写入 -> 对话数据丢失 | Phase 1 无锁(archive 不完整无副作用)+ Phase 2 RedoLog | + +LLM 调用耗时不可控(5s~60s+),不能放在持锁操作内。设计拆为两个阶段: ``` -viking://resources/github/volcengine/OpenViking/.path.ovlock +Phase 1 — 归档(无锁): + 1. 生成归档摘要(LLM) + 2. 写 archive(history/archive_N/messages.jsonl + 摘要) + 3. 清空 messages.jsonl + 4. 清空内存中的消息列表 + +Phase 2 — 记忆提取 + 写入(RedoLog): + 1. 写 redo 标记(archive_uri、session_uri、用户身份信息) + 2. 从归档消息提取 memories(LLM) + 3. 写当前消息状态 + 4. 写 relations + 5. 直接 enqueue SemanticQueue + 6. 
删除 redo 标记 ``` -- 锁文件存在即表示已加锁 -- 文件内容为事务ID,用于标识当前事务 -- 事务操作完成后,删除锁文件以释放锁 +**崩溃恢复分析**: -### 加锁流程 +| 崩溃时间点 | 状态 | 恢复动作 | +|-----------|------|---------| +| Phase 1 写 archive 中途 | 无标记 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | +| Phase 1 archive 完成但 messages 未清空 | 无标记 | archive 完整 + messages 仍在 = 数据冗余但安全 | +| Phase 2 记忆提取/写入中途 | redo 标记存在 | 启动恢复:从 archive 重做提取+写入+入队 | +| Phase 2 完成 | redo 标记已删 | 无需恢复 | -#### 普通操作加锁流程 +## LockContext +`LockContext` 是**异步**上下文管理器,封装锁的获取和释放: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +lock_manager = get_lock_manager() + +# Point 锁(写操作、语义处理) +async with LockContext(lock_manager, [path], lock_mode="point"): + # 执行操作... + pass + +# Subtree 锁(删除操作) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # 执行操作... + pass + +# MV 锁(移动操作) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): + # 执行操作... + pass ``` -1. 检查目标目录是否存在 -2. 检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 创建 .path.ovlock 文件,文件内容为事务ID -5. 再次检查目标目录的父目录是否已被其他事务锁定 -6. 读取刚创建的 .path.ovlock 文件内容,确认为当前事务ID -7. 一切正常,则返回加锁成功 -``` -#### rm 操作加锁流程 +**锁模式**: + +| lock_mode | 用途 | 行为 | +|-----------|------|------| +| `point` | 写操作、语义处理 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | + +**异常处理**:`__aexit__` 总是释放锁,不吞异常。获取锁失败时抛出 `LockAcquisitionError`。 + +## 锁类型(POINT vs SUBTREE) + +锁机制使用两种锁类型来处理不同的冲突场景: + +| | 同路径 POINT | 同路径 SUBTREE | 后代 POINT | 祖先 SUBTREE | +|---|---|---|---|---| +| **POINT** | 冲突 | 冲突 | — | 冲突 | +| **SUBTREE** | 冲突 | 冲突 | 冲突 | 冲突 | + +- **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 +- **SUBTREE (S)**:用于删除和移动操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代和祖先目录确认无冲突锁。 +## 锁机制 + +### 锁协议 + +锁文件路径:`{path}/.path.ovlock` + +锁文件内容(Fencing Token): +``` +{handle_id}:{time_ns}:{lock_type} ``` -# 传统串行模式:存在更大的竞态条件窗口 -1. 检查目标目录是否存在 -2. 
检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 在目标目录下创建 .path.ovlock 文件,文件内容为事务ID -5. 递归地在目标目录的所有子目录下创建 .path.ovlock 文件 -6. 如果发生加锁失败,移除所有已经创建的 .path.ovlock 文件 -7. 一切正常,则返回加锁成功 -# 自下而上并行模式 -1. 并行遍历整个目录树,收集所有子目录路径 -2. 按照目录层级从深到浅排序,从最深层子目录开始 -3. 以有限并行度(默认最大8)批量创建 .path.ovlock 文件 -4. 最后锁定目标目录 -5. 如果任一位置加锁失败,逆序移除所有已经创建的 .path.ovlock 文件 +其中 `lock_type` 为 `P`(POINT)或 `S`(SUBTREE)。 + +### 获取锁流程(POINT 模式) + +``` +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 4. 写入 POINT (P) 锁文件 + 5. TOCTOU 双重检查:重新扫描祖先目录的 SUBTREE 锁 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 6. 验证锁文件归属(fencing token 匹配) + 7. 成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError ``` -#### mv 操作加锁流程 +### 获取锁流程(SUBTREE 模式) ``` -1. 先参照 rm 操作对原目录进行加锁 -2. 再参照普通操作过程对新目录进行加锁 +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 4. 扫描所有后代目录,检查是否有其他操作持有的锁 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 5. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) + 6. TOCTOU 双重检查:重新扫描后代目录和祖先目录 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 7. 验证锁文件归属(fencing token 匹配) + 8. 
成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError ``` -### 锁机制性能分析 +### 锁过期清理 + +**陈旧锁检测**:PathLock 检查 fencing token 中的时间戳。超过 `lock_expire`(默认 300s)的锁被视为陈旧锁,在加锁过程中自动移除。 + +**进程内清理**:LockManager 每 60 秒检查活跃的 LockHandle,创建超过 3600 秒的 handle 强制释放。 + +**孤儿锁**:进程崩溃后遗留的锁文件,在下次任何操作尝试获取同一路径锁时,通过 stale lock 检测自动移除。 + +## 崩溃恢复 + +`LockManager.start()` 启动时自动扫描 `/local/_system/redo/` 目录中的遗留标记: + +| 场景 | 恢复方式 | +|------|---------| +| session_memory 提取中途崩溃 | 从 archive 重做记忆提取 + 写入 + enqueue | +| 锁持有期间崩溃 | 锁文件留在 AGFS,下次获取时 stale 检测自动清理(默认 300s 过期)| +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化,worker 重启后自动拉取 | +| 孤儿索引 | L2 按需加载时清理 | + +### 防线总结 + +| 异常场景 | 防线 | 恢复时机 | +|---------|------|---------| +| 操作中途崩溃 | 锁自动过期 + stale 检测 | 下次获取同路径锁时 | +| add_resource 语义处理中途崩溃 | 生命周期锁过期 + SemanticProcessor 重启时重新获取 | worker 重启后 | +| session.commit Phase 2 崩溃 | RedoLog 标记 + 重做 | 重启时 | +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后 | +| 孤儿索引 | L2 按需加载时清理 | 用户访问时 | + +## 配置 + +路径锁默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | +| `lock_expire` | float | 锁过期时间(秒),超过此时间的锁将被视为陈旧锁并强制释放 | `300.0` | + +### QueueFS 持久化 -- 并行遍历采用广度优先策略,同时处理同一层级的所有目录 -- 并行加锁从最深层开始,逐层向上锁定,确保整个目录树的一致性 -- 有限并行度(默认最大8)避免AGFS服务过载 -- 加锁失败时采用逆序回滚,确保所有已加锁目录都能正确释放 -- 事务状态机明确区分锁管理过程(AQUIRE+RELEASING状态),提高系统可观测性和调试效率 +路径锁机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 ## 相关文档 - [架构概述](./01-architecture.md) - 系统整体架构 - [存储架构](./05-storage.md) - AGFS 和向量库 -- [会话管理](./08-session.md) - 会话和记忆管理 \ No newline at end of file +- [会话管理](./08-session.md) - 会话和记忆管理 +- [配置](../guides/01-configuration.md) - 配置文件说明 diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 889c737a..b0954bbf 100644 --- 
a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -489,10 +489,9 @@ AST 提取支持:Python、JavaScript/TypeScript、Rust、Go、Java、C/C++。 - #### vectordb -向量库存储的配置 +向量库存储的配置 | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| @@ -614,6 +613,28 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 启动方式和部署详情见 [服务部署](./03-deployment.md),认证详情见 [认证](./04-authentication.md)。 +## storage.transaction 段 + +路径锁默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他操作锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取路径锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间后抛出 `LockAcquisitionError` | `0.0` | +| `lock_expire` | float | 锁过期时间(秒)。超过此时间的锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | + +路径锁机制的详细说明见 [路径锁与崩溃恢复](../concepts/09-transaction.md)。 + ## 完整 Schema ```json @@ -648,6 +669,10 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/openviking/agfs_manager.py b/openviking/agfs_manager.py index 14ed124a..9ae796f2 100644 --- a/openviking/agfs_manager.py +++ b/openviking/agfs_manager.py @@ -133,9 +133,23 @@ def _generate_config(self) -> Path: "version": "1.0.0", }, }, + # TODO(multi-node): SQLite backend is single-node only. Each AGFS instance + # gets its own isolated queue.db under its own data_path, so messages + # enqueued on node A are invisible to node B. For multi-node deployments, + # switch backend to "tidb" or "mysql" so all nodes share the same queue. + # + # Additionally, the TiDB backend currently uses immediate soft-delete on + # Dequeue (no two-phase status='processing' transition), meaning there is + # no at-least-once guarantee: a worker crash loses the in-flight message. 
+ # The TiDB backend's Ack() and RecoverStale() are both no-ops and must be + # implemented before it can be used safely in production. "queuefs": { "enabled": True, "path": "/queue", + "config": { + "backend": "sqlite", + "db_path": str(self.data_path / "_system" / "queue" / "queue.db"), + }, }, }, } @@ -196,6 +210,7 @@ def start(self) -> None: self._check_port_available() self.vikingfs_path.mkdir(parents=True, exist_ok=True) + (self.data_path / "_system" / "queue").mkdir(parents=True, exist_ok=True) # NOTICE: should use viking://temp/ instead of self.vikingfs_path / "temp" # Create temp directory for Parser use # (self.vikingfs_path / "temp").mkdir(exist_ok=True) diff --git a/openviking/async_client.py b/openviking/async_client.py index 294d15da..b87b05ff 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -97,6 +97,11 @@ async def reset(cls) -> None: await cls._instance.close() cls._instance = None + # Also reset lock manager singleton + from openviking.storage.transaction import reset_lock_manager + + reset_lock_manager() + # ============= Session methods ============= def session(self, session_id: Optional[str] = None, must_exist: bool = False) -> Session: diff --git a/openviking/core/building_tree.py b/openviking/core/building_tree.py index 9685a56d..43207e29 100644 --- a/openviking/core/building_tree.py +++ b/openviking/core/building_tree.py @@ -28,6 +28,7 @@ def __init__( self._contexts: List["Context"] = [] self._uri_map: Dict[str, "Context"] = {} self._root_uri: Optional[str] = None + self._candidate_uri: Optional[str] = None def add_context(self, context: "Context") -> None: """Add a context to the tree.""" diff --git a/openviking/eval/ragas/__init__.py b/openviking/eval/ragas/__init__.py index 03336bc7..df295210 100644 --- a/openviking/eval/ragas/__init__.py +++ b/openviking/eval/ragas/__init__.py @@ -111,8 +111,8 @@ def _create_ragas_llm_from_config() -> Optional[Any]: RAGAS LLM instance or None if VLM is not configured. 
""" try: - from openai import OpenAI - from ragas.llms import llm_factory + from langchain_openai import ChatOpenAI + from ragas.llms import LangchainLLMWrapper except ImportError: return None @@ -124,11 +124,12 @@ def _create_ragas_llm_from_config() -> Optional[Any]: logger.info(f"Using RAGAS LLM from environment: model={model_name}, base_url={api_base}") - client = OpenAI( + openai_model = ChatOpenAI( + model=model_name, api_key=api_key, base_url=api_base, ) - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) try: from openviking_cli.utils.config import get_openviking_config @@ -151,13 +152,13 @@ def _create_ragas_llm_from_config() -> Optional[Any]: ) return None - client = OpenAI( + model_name = vlm_config.model or "gpt-4o-mini" + openai_model = ChatOpenAI( + model=model_name, api_key=vlm_config.api_key, base_url=vlm_config.api_base, ) - - model_name = vlm_config.model or "gpt-4o-mini" - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) class RagasEvaluator(BaseEvaluator): diff --git a/openviking/models/embedder/jina_embedders.py b/openviking/models/embedder/jina_embedders.py index 9f1c87cf..3ab3fade 100644 --- a/openviking/models/embedder/jina_embedders.py +++ b/openviking/models/embedder/jina_embedders.py @@ -60,6 +60,7 @@ def __init__( document_param: Optional[str] = "retrieval.passage", late_chunking: Optional[bool] = None, config: Optional[Dict[str, Any]] = None, + task: Optional[str] = None, ): """Initialize Jina AI Dense Embedder diff --git a/openviking/models/embedder/openai_embedders.py b/openviking/models/embedder/openai_embedders.py index ab2632c1..2924b98d 100644 --- a/openviking/models/embedder/openai_embedders.py +++ b/openviking/models/embedder/openai_embedders.py @@ -72,6 +72,7 @@ def __init__( config: Optional[Dict[str, Any]] = None, max_tokens: Optional[int] = None, extra_headers: Optional[Dict[str, str]] = None, + input_type: Optional[str] = None, ): """Initialize 
OpenAI-Compatible Dense Embedder @@ -115,7 +116,7 @@ def __init__( # Allow missing api_key when api_base is set (e.g. local OpenAI-compatible servers) if not self.api_key and not self.api_base: - raise ValueError("api_key is required (or set api_base for local servers)") + raise ValueError("api_key is required") # Initialize OpenAI client # Use a placeholder api_key when not provided (for local OpenAI-compatible servers) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 4260065e..deea5efc 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -21,7 +21,7 @@ """ import logging -from typing import TYPE_CHECKING, Optional +from typing import Optional from openviking.core.building_tree import BuildingTree from openviking.core.context import Context @@ -31,9 +31,6 @@ from openviking.utils import parse_code_hosting_url from openviking_cli.utils.uri import VikingURI -if TYPE_CHECKING: - pass - logger = logging.getLogger(__name__) @@ -78,6 +75,31 @@ def _get_base_uri( # Agent scope return "viking://agent" + async def _resolve_unique_uri(self, uri: str, max_attempts: int = 100) -> str: + """Return a URI that does not collide with an existing resource. + + If *uri* is free, return it unchanged. Otherwise append ``_1``, + ``_2``, ... until a free name is found. 
+ """ + viking_fs = get_viking_fs() + + async def _exists(u: str) -> bool: + try: + await viking_fs.stat(u) + return True + except Exception: + return False + + if not await _exists(uri): + return uri + + for i in range(1, max_attempts + 1): + candidate = f"{uri}_{i}" + if not await _exists(candidate): + return candidate + + raise FileExistsError(f"Cannot resolve unique name for {uri} after {max_attempts} attempts") + # ============================================================================ # v5.0 Methods (temporary directory + SemanticQueue architecture) # ============================================================================ @@ -145,13 +167,18 @@ async def finalize_from_temp( raise ValueError(f"Parent URI is not a directory: {parent_uri}") candidate_uri = VikingURI(base_uri).join(final_doc_name).uri - final_uri = candidate_uri + if to_uri: + final_uri = candidate_uri + else: + final_uri = await self._resolve_unique_uri(candidate_uri) tree = BuildingTree( source_path=source_path, source_format=source_format, ) tree._root_uri = final_uri + if not to_uri: + tree._candidate_uri = candidate_uri # Create a minimal Context object for the root so that tree.root is not None root_context = Context(uri=final_uri, temp_uri=temp_doc_uri) diff --git a/openviking/server/app.py b/openviking/server/app.py index c22794e1..c553c6ff 100644 --- a/openviking/server/app.py +++ b/openviking/server/app.py @@ -59,7 +59,8 @@ def create_app( async def lifespan(app: FastAPI): """Application lifespan handler.""" nonlocal service - if service is None: + owns_service = service is None + if owns_service: service = OpenVikingService() await service.initialize() logger.info("OpenVikingService initialized") @@ -93,7 +94,7 @@ async def lifespan(app: FastAPI): # Cleanup task_tracker.stop_cleanup_loop() - if service: + if owns_service and service: await service.close() logger.info("OpenVikingService closed") diff --git a/openviking/server/routers/content.py 
b/openviking/server/routers/content.py index 9b4d0279..2463cc08 100644 --- a/openviking/server/routers/content.py +++ b/openviking/server/routers/content.py @@ -102,7 +102,7 @@ async def reindex( database. If regenerate=True, also regenerates L0/L1 summaries via LLM before re-embedding. - Uses transaction locking to prevent concurrent reindexes on the same URI. + Uses path locking to prevent concurrent reindexes on the same URI. Set wait=False to run in the background and track progress via task API. """ from openviking.service.task_tracker import get_task_tracker @@ -164,27 +164,17 @@ async def _do_reindex( regenerate: bool, ctx: RequestContext, ) -> dict: - """Execute reindex within a transaction.""" - from openviking.storage.transaction import get_transaction_manager + """Execute reindex within a lock scope.""" + from openviking.storage.transaction import LockContext, get_lock_manager - tm = get_transaction_manager() - tx = tm.create_transaction(init_info={"uri": uri, "regenerate": regenerate}) - await tm.begin(tx.id) + viking_fs = service.viking_fs + path = viking_fs._uri_to_path(uri, ctx=ctx) - try: - await tm.acquire_lock_normal(tx.id, uri) + async with LockContext(get_lock_manager(), [path], lock_mode="point"): if regenerate: - result = await service.resources.summarize([uri], ctx=ctx) + return await service.resources.summarize([uri], ctx=ctx) else: - result = await service.resources.build_index([uri], ctx=ctx) - await tm.commit(tx.id) - return result - except Exception: - try: - await tm.rollback(tx.id) - except Exception: - pass - raise + return await service.resources.build_index([uri], ctx=ctx) async def _background_reindex_tracked( diff --git a/openviking/server/routers/observer.py b/openviking/server/routers/observer.py index 4d214cbf..e1910596 100644 --- a/openviking/server/routers/observer.py +++ b/openviking/server/routers/observer.py @@ -72,13 +72,13 @@ async def observer_vlm( return Response(status="ok", result=_component_to_dict(component)) 
-@router.get("/transaction") -async def observer_transaction( +@router.get("/lock") +async def observer_lock( _ctx: RequestContext = Depends(get_request_context), ): - """Get transaction system status.""" + """Get lock system status.""" service = get_service() - component = service.debug.observer.transaction + component = service.debug.observer.lock return Response(status="ok", result=_component_to_dict(component)) diff --git a/openviking/service/core.py b/openviking/service/core.py index c4ab6ae0..d25830ea 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -23,7 +23,7 @@ from openviking.storage import VikingDBManager from openviking.storage.collection_schemas import init_context_collection from openviking.storage.queuefs.queue_manager import QueueManager, init_queue_manager -from openviking.storage.transaction import TransactionManager, init_transaction_manager +from openviking.storage.transaction import LockManager, init_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking.utils.resource_processor import ResourceProcessor from openviking.utils.skill_processor import SkillProcessor @@ -75,7 +75,7 @@ def __init__( self._resource_processor: Optional[ResourceProcessor] = None self._skill_processor: Optional[SkillProcessor] = None self._session_compressor: Optional[SessionCompressor] = None - self._transaction_manager: Optional[TransactionManager] = None + self._lock_manager: Optional[LockManager] = None self._directory_initializer: Optional[DirectoryInitializer] = None # Sub-services @@ -136,14 +136,21 @@ def _init_storage( vectordb_config=config.vectordb, queue_manager=self._queue_manager ) - # Configure queues if QueueManager is available + # Configure queues if QueueManager is available. + # Workers are NOT started here — start() is called after VikingFS is initialized + # in initialize(), so that recovered tasks don't race against VikingFS init. 
if self._queue_manager: - self._queue_manager.setup_standard_queues(self._vikingdb_manager) + self._queue_manager.setup_standard_queues(self._vikingdb_manager, start=False) - # Initialize TransactionManager (fail-fast if AGFS missing) + # Initialize LockManager (fail-fast if AGFS missing) if self._agfs_client is None: - raise RuntimeError("AGFS client not initialized for TransactionManager") - self._transaction_manager = init_transaction_manager(agfs=self._agfs_client) + raise RuntimeError("AGFS client not initialized for LockManager") + tx_cfg = config.transaction + self._lock_manager = init_lock_manager( + agfs=self._agfs_client, + lock_timeout=tx_cfg.lock_timeout, + lock_expire=tx_cfg.lock_expire, + ) @property def _agfs(self) -> Any: @@ -161,9 +168,9 @@ def vikingdb_manager(self) -> Optional[VikingDBManager]: return self._vikingdb_manager @property - def transaction_manager(self) -> Optional[TransactionManager]: - """Get TransactionManager instance.""" - return self._transaction_manager + def lock_manager(self) -> Optional[LockManager]: + """Get LockManager instance.""" + return self._lock_manager @property def session_compressor(self) -> Optional[SessionCompressor]: @@ -257,6 +264,14 @@ async def initialize(self) -> None: if enable_recorder: logger.info("VikingFS IO Recorder enabled") + # Start queue workers now that VikingFS is ready. + # Doing it here (rather than in _init_storage) ensures that any tasks + # recovered from a previous crash are not processed before VikingFS is + # initialized, which would cause "VikingFS not initialized" errors. 
+ if self._queue_manager: + self._queue_manager.start() + logger.info("QueueManager workers started") + # Initialize directories directory_initializer = DirectoryInitializer(vikingdb=self._vikingdb_manager) self._directory_initializer = directory_initializer @@ -276,10 +291,10 @@ async def initialize(self) -> None: self._skill_processor = SkillProcessor(vikingdb=self._vikingdb_manager) self._session_compressor = SessionCompressor(vikingdb=self._vikingdb_manager) - # Start TransactionManager if initialized - if self._transaction_manager: - await self._transaction_manager.start() - logger.info("TransactionManager started") + # Start LockManager if initialized + if self._lock_manager: + await self._lock_manager.start() + logger.info("LockManager started") # Wire up sub-services self._fs_service.set_viking_fs(self._viking_fs) @@ -307,9 +322,9 @@ async def initialize(self) -> None: async def close(self) -> None: """Close OpenViking and release resources.""" - if self._transaction_manager: - self._transaction_manager.stop() - self._transaction_manager = None + if self._lock_manager: + await self._lock_manager.stop() + self._lock_manager = None if self._vikingdb_manager: self._vikingdb_manager.mark_closing() diff --git a/openviking/service/debug_service.py b/openviking/service/debug_service.py index 9c3cf39b..b99b4e73 100644 --- a/openviking/service/debug_service.py +++ b/openviking/service/debug_service.py @@ -9,14 +9,14 @@ from openviking.storage import VikingDBManager from openviking.storage.observers import ( + LockObserver, QueueObserver, RetrievalObserver, - TransactionObserver, VikingDBObserver, VLMObserver, ) from openviking.storage.queuefs import get_queue_manager -from openviking.storage.transaction import get_transaction_manager +from openviking.storage.transaction import get_lock_manager from openviking_cli.utils.config import OpenVikingConfig @@ -136,19 +136,20 @@ def vlm(self) -> ComponentStatus: ) @property - def transaction(self) -> ComponentStatus: - 
"""Get transaction status.""" - transaction_manager = get_transaction_manager() - if transaction_manager is None: + def lock(self) -> ComponentStatus: + """Get lock system status.""" + try: + lock_manager = get_lock_manager() + except Exception: return ComponentStatus( - name="transaction", + name="lock", is_healthy=False, has_errors=True, - status="Transaction manager not initialized.", + status="Not initialized", ) - observer = TransactionObserver(transaction_manager) + observer = LockObserver(lock_manager) return ComponentStatus( - name="transaction", + name="lock", is_healthy=observer.is_healthy(), has_errors=observer.has_errors(), status=observer.get_status_table(), @@ -172,7 +173,7 @@ def system(self) -> SystemStatus: "queue": self.queue, "vikingdb": self.vikingdb, "vlm": self.vlm, - "transaction": self.transaction, + "lock": self.lock, "retrieval": self.retrieval, } errors = [f"{c.name} has errors" for c in components.values() if c.has_errors] diff --git a/openviking/session/session.py b/openviking/session/session.py index 55418027..bdb6500b 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -220,85 +220,19 @@ def update_tool_part( self._update_message_in_jsonl() def commit(self) -> Dict[str, Any]: - """Commit session: create archive, extract memories, persist.""" - result = { - "session_id": self.session_id, - "status": "committed", - "memories_extracted": 0, - "active_count_updated": 0, - "archived": False, - "stats": None, - } - if not self._messages: - get_current_telemetry().set("memory.extracted", 0) - return result - - # 1. 
Archive current messages - self._compression.compression_index += 1 - messages_to_archive = self._messages.copy() - - summary = self._generate_archive_summary(messages_to_archive) - archive_abstract = self._extract_abstract_from_summary(summary) - archive_overview = summary + """Sync wrapper for commit_async().""" + return run_async(self.commit_async()) - self._write_archive( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - - self._compression.original_count += len(messages_to_archive) - result["archived"] = True - - self._messages.clear() - logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" - ) - - # 2. Extract long-term memories - if self._session_compressor: - logger.info( - f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = run_async( - self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - # 3. Write current messages to AGFS - self._write_to_agfs(self._messages) - - # 4. Create relations - self._write_relations() - - # 5. Update active_count - active_count_updated = self._update_active_counts() - result["active_count_updated"] = active_count_updated + async def commit_async(self) -> Dict[str, Any]: + """Async commit session: two-phase approach. - # 6. 
Update statistics - self._stats.compression_count = self._compression.compression_index - result["stats"] = { - "total_turns": self._stats.total_turns, - "contexts_used": self._stats.contexts_used, - "skills_used": self._stats.skills_used, - "memories_extracted": self._stats.memories_extracted, - } + Phase 1 (Archive): Write archive, clear messages. + Phase 2 (Memory, redo-log protected): Extract memories, write, enqueue. + """ + import uuid - self._stats.total_tokens = 0 - logger.info(f"Session {self.session_id} committed") - return result + from openviking.storage.transaction import get_lock_manager - async def commit_async(self) -> Dict[str, Any]: - """Async commit session: create archive, extract memories, persist.""" result = { "session_id": self.session_id, "status": "committed", @@ -311,7 +245,7 @@ async def commit_async(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", 0) return result - # 1. Archive current messages + # ===== Preparation ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -319,22 +253,41 @@ async def commit_async(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary + # ===== Phase 1: Archive (no lock) ===== + archive_uri = ( + f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" + ) await self._write_archive_async( index=self._compression.compression_index, messages=messages_to_archive, abstract=archive_abstract, overview=archive_overview, ) + await self._write_to_agfs_async(messages=[]) + self._messages.clear() self._compression.original_count += len(messages_to_archive) result["archived"] = True - - self._messages.clear() logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" + f"Archived: {len(messages_to_archive)} messages → " + f"history/archive_{self._compression.compression_index:03d}/" + ) + + # ===== Phase 2: 
Memory extraction + write (redo-log protected) ===== + redo_log = get_lock_manager().redo_log + task_id = str(uuid.uuid4()) + redo_log.write_pending( + task_id, + { + "archive_uri": archive_uri, + "session_uri": self._session_uri, + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + }, ) - # 2. Extract long-term memories if self._session_compressor: logger.info( f"Starting memory extraction from {len(messages_to_archive)} archived messages" @@ -350,17 +303,33 @@ async def commit_async(self) -> Dict[str, Any]: self._stats.memories_extracted += len(memories) get_current_telemetry().set("memory.extracted", len(memories)) - # 3. Write current messages to AGFS await self._write_to_agfs_async(self._messages) - - # 4. Create relations await self._write_relations_async() - # 5. Update active_count + # Enqueue semantic processing directly + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + + queue_manager = get_queue_manager() + if queue_manager: + msg = SemanticMsg( + uri=self._session_uri, + context_type="memory", + account_id=self.ctx.account_id, + user_id=self.ctx.user.user_id, + agent_id=self.ctx.user.agent_id, + role=self.ctx.role.value, + ) + semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) + await semantic_queue.enqueue(msg) + + redo_log.mark_done(task_id) + + # Update active_count active_count_updated = await self._update_active_counts_async() result["active_count_updated"] = active_count_updated - # 6. 
Update statistics + # Update statistics self._stats.compression_count = self._compression.compression_index result["stats"] = { "total_turns": self._stats.total_turns, diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index bc3e36be..def786be 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -29,3 +29,15 @@ class ConnectionError(StorageException): class SchemaError(StorageException): """Raised when schema validation fails.""" + + +class LockError(VikingDBException): + """Raised when a lock operation fails.""" + + +class LockAcquisitionError(LockError): + """Raised when lock acquisition fails.""" + + +class ResourceBusyError(LockError): + """Raised when a resource is locked by an ongoing operation (e.g. semantic processing).""" diff --git a/openviking/storage/local_fs.py b/openviking/storage/local_fs.py index c4683dd7..3d23566d 100644 --- a/openviking/storage/local_fs.py +++ b/openviking/storage/local_fs.py @@ -11,6 +11,7 @@ from openviking.server.identity import RequestContext from openviking.storage.queuefs import EmbeddingQueue, get_queue_manager from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter +from openviking_cli.exceptions import NotFoundError from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI @@ -178,7 +179,7 @@ async def import_ovpack( f"Resource already exists at {root_uri}. Use force=True to overwrite." 
) logger.info(f"[local_fs] Overwriting existing resource at {root_uri}") - except FileNotFoundError: + except NotFoundError: # Path does not exist, safe to import pass @@ -204,9 +205,10 @@ async def import_ovpack( if not zip_path: continue - # Normalize path separators to handle Windows-created ZIPs - zip_path = zip_path.replace("\\", "/") + # Validate before normalization so backslash paths are rejected safe_zip_path = _validate_ovpack_member_path(zip_path, base_name) + # Normalize path separators to handle Windows-created ZIPs + safe_zip_path = safe_zip_path.replace("\\", "/") # Handle directory entries if safe_zip_path.endswith("/"): diff --git a/openviking/storage/observers/__init__.py b/openviking/storage/observers/__init__.py index 4a36700a..dae5aed3 100644 --- a/openviking/storage/observers/__init__.py +++ b/openviking/storage/observers/__init__.py @@ -1,17 +1,17 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 from .base_observer import BaseObserver +from .lock_observer import LockObserver from .queue_observer import QueueObserver from .retrieval_observer import RetrievalObserver -from .transaction_observer import TransactionObserver from .vikingdb_observer import VikingDBObserver from .vlm_observer import VLMObserver __all__ = [ "BaseObserver", + "LockObserver", "QueueObserver", "RetrievalObserver", - "TransactionObserver", "VikingDBObserver", "VLMObserver", ] diff --git a/openviking/storage/observers/lock_observer.py b/openviking/storage/observers/lock_observer.py new file mode 100644 index 00000000..92521790 --- /dev/null +++ b/openviking/storage/observers/lock_observer.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""LockObserver: Lock system observability.""" + +import time +from typing import Any, Dict, List + +from openviking.storage.observers.base_observer import BaseObserver +from openviking.storage.transaction.lock_manager import LockManager +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockObserver(BaseObserver): + """Observability tool for the lock system.""" + + def __init__(self, lock_manager: LockManager): + self._manager = lock_manager + + def get_active_locks(self) -> List[Dict[str, Any]]: + """Return info about every active lock handle.""" + now = time.time() + return [ + { + "id": h.id, + "lock_count": len(h.locks), + "created_at": h.created_at, + "duration_seconds": round(now - h.created_at, 1), + } + for h in self._manager.get_active_handles().values() + ] + + def get_hanging_locks(self, threshold: float = 600) -> List[Dict[str, Any]]: + """Return locks that have been held longer than *threshold* seconds.""" + now = time.time() + return [lock for lock in self.get_active_locks() if now - lock["created_at"] > threshold] + + # ------ BaseObserver interface ------ + + def get_status_table(self) -> str: + locks = self.get_active_locks() + if not locks: + return "No active locks." 
+ + from tabulate import tabulate + + data = [ + { + "Handle ID": l["id"][:8] + "...", + "Locks": l["lock_count"], + "Duration": f"{l['duration_seconds']}s", + "Created": time.strftime("%H:%M:%S", time.localtime(l["created_at"])), + } + for l in locks + ] + data.append( + { + "Handle ID": f"TOTAL ({len(locks)})", + "Locks": sum(l["lock_count"] for l in locks), + "Duration": "", + "Created": "", + } + ) + return tabulate(data, headers="keys", tablefmt="pretty") + + def is_healthy(self) -> bool: + return not self.get_hanging_locks(600) + + def has_errors(self) -> bool: + return bool(self.get_hanging_locks(600)) diff --git a/openviking/storage/observers/transaction_observer.py b/openviking/storage/observers/transaction_observer.py deleted file mode 100644 index dce4555d..00000000 --- a/openviking/storage/observers/transaction_observer.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -TransactionObserver: Transaction system observability tool. - -Provides methods to observe and report transaction manager status. -""" - -import time -from typing import Any, Dict - -from openviking.storage.observers.base_observer import BaseObserver -from openviking.storage.transaction import TransactionManager -from openviking.storage.transaction.transaction_record import TransactionStatus -from openviking_cli.utils import run_async -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -class TransactionObserver(BaseObserver): - """ - TransactionObserver: System observability tool for transaction management. - - Provides methods to query transaction status and format output. - """ - - def __init__(self, transaction_manager: TransactionManager): - """Initialize transaction observer. 
- - Args: - transaction_manager: Transaction manager instance to observe - """ - self._transaction_manager = transaction_manager - - async def get_status_table_async(self) -> str: - """Get transaction status table asynchronously. - - Returns: - Formatted table string showing transaction status - """ - if not self._transaction_manager: - return "Transaction manager not initialized." - - transactions = self._transaction_manager.get_active_transactions() - - if not transactions: - return "No active transactions." - - return self._format_status_as_table(transactions) - - def get_status_table(self) -> str: - """Get transaction status table synchronously. - - Returns: - Formatted table string showing transaction status - """ - return run_async(self.get_status_table_async()) - - def __str__(self) -> str: - """String representation returns status table. - - Returns: - Formatted table string - """ - return self.get_status_table() - - def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: - """Format transaction statuses as a table. 
- - Args: - transactions: Dict mapping transaction IDs to TransactionRecord - - Returns: - Formatted table string - """ - from tabulate import tabulate - - data = [] - - # Group transactions by status - status_counts = { - TransactionStatus.INIT: 0, - TransactionStatus.AQUIRE: 0, - TransactionStatus.EXEC: 0, - TransactionStatus.COMMIT: 0, - TransactionStatus.FAIL: 0, - TransactionStatus.RELEASING: 0, - TransactionStatus.RELEASED: 0, - } - - for tx_id, tx in transactions.items(): - duration = time.time() - tx.created_at - duration_str = f"{duration:.1f}s" - - status_counts[tx.status] += 1 - - data.append( - { - "Transaction ID": tx_id[:8] + "...", - "Status": str(tx.status), - "Locks": len(tx.locks), - "Duration": duration_str, - "Created": time.strftime("%H:%M:%S", time.localtime(tx.created_at)), - } - ) - - status_priority = { - TransactionStatus.EXEC: 0, - TransactionStatus.AQUIRE: 1, - TransactionStatus.RELEASING: 2, - TransactionStatus.INIT: 3, - TransactionStatus.COMMIT: 4, - TransactionStatus.FAIL: 5, - TransactionStatus.RELEASED: 6, - } - - data.sort(key=lambda x: status_priority.get(TransactionStatus(x["Status"]), 99)) - - total = len(transactions) - total_locks = sum(len(tx.locks) for tx in transactions.values()) - - summary_row = { - "Transaction ID": f"TOTAL ({total})", - "Status": "", - "Locks": total_locks, - "Duration": "", - "Created": "", - } - data.append(summary_row) - - return tabulate(data, headers="keys", tablefmt="pretty") - - def is_healthy(self) -> bool: - """Check if transaction system is healthy. - - Returns: - True if system is healthy, False otherwise - """ - return not self.has_errors() - - def has_errors(self) -> bool: - """Check if transaction system has any errors. 
- - Returns: - True if errors (failed transactions) exist, False otherwise - """ - if not self._transaction_manager: - return True - - transactions = self._transaction_manager.get_active_transactions() - - # Check for failed transactions - for tx_id, tx in transactions.items(): - if tx.status == TransactionStatus.FAIL: - logger.warning(f"Found failed transaction: {tx_id}") - return True - - return False - - def get_failed_transactions(self) -> Dict[str, Any]: - """Get all failed transactions. - - Returns: - Dict mapping transaction IDs to failed TransactionRecord - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - return { - tx_id: tx for tx_id, tx in transactions.items() if tx.status == TransactionStatus.FAIL - } - - def get_hanging_transactions(self, timeout_threshold: int = 300) -> Dict[str, Any]: - """Get transactions that have been running longer than threshold. - - Args: - timeout_threshold: Timeout threshold in seconds (default: 300 = 5 minutes) - - Returns: - Dict mapping transaction IDs to TransactionRecord that exceed threshold - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - current_time = time.time() - - return { - tx_id: tx - for tx_id, tx in transactions.items() - if current_time - tx.created_at > timeout_threshold - } - - def get_status_summary(self) -> Dict[str, int]: - """Get summary of transaction counts by status. 
- - Returns: - Dict mapping status strings to counts - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - - summary = { - "INIT": 0, - "AQUIRE": 0, - "EXEC": 0, - "COMMIT": 0, - "FAIL": 0, - "RELEASING": 0, - "RELEASED": 0, - "TOTAL": 0, - } - - for tx in transactions.values(): - summary[str(tx.status)] += 1 - summary["TOTAL"] += 1 - - return summary diff --git a/openviking/storage/queuefs/named_queue.py b/openviking/storage/queuefs/named_queue.py index ad79202b..ce7705b7 100644 --- a/openviking/storage/queuefs/named_queue.py +++ b/openviking/storage/queuefs/named_queue.py @@ -198,6 +198,21 @@ async def enqueue(self, data: Union[str, Dict[str, Any]]) -> str: msg_id = self._agfs.write(enqueue_file, data.encode("utf-8")) return msg_id if isinstance(msg_id, str) else str(msg_id) + async def ack(self, msg_id: str) -> None: + """Acknowledge successful processing of a message (deletes it from persistent storage). + + Must be called after the dequeue handler finishes processing a message. + If not called (e.g. process crashes), the message will be automatically + re-queued on the next startup via RecoverStale. + """ + if not msg_id: + return + ack_file = f"{self.path}/ack" + try: + self._agfs.write(ack_file, msg_id.encode("utf-8")) + except Exception as e: + logger.warning(f"[NamedQueue] Ack failed for {self.name} msg_id={msg_id}: {e}") + def _read_queue_message(self) -> Optional[Dict[str, Any]]: """Read and remove one message from the AGFS queue; return parsed dict or None. @@ -217,15 +232,30 @@ def _read_queue_message(self) -> Optional[Dict[str, Any]]: return json.loads(raw.decode("utf-8")) async def dequeue(self) -> Optional[Dict[str, Any]]: - """Get and remove message from queue, then invoke the dequeue handler.""" + """Dequeue a message, process it, then ack to confirm deletion. + + Flow (at-least-once delivery): + 1. 
Read from /dequeue → backend marks message as 'processing' (not deleted yet) + 2. Call on_dequeue() → actual processing + 3. Call ack() → backend deletes the message permanently + + If the process crashes between steps 1 and 3, the backend's RecoverStale + on the next startup resets the message back to 'pending' for retry. + """ await self._ensure_initialized() try: data = self._read_queue_message() if data is None: return None + # Capture message ID before passing data to handler (handler may modify it) + msg_id = data.get("id", "") if isinstance(data, dict) else "" if self._dequeue_handler: self._on_dequeue_start() data = await self._dequeue_handler.on_dequeue(data) + # Ack unconditionally after handler returns (success or handled error). + # If on_dequeue raises, the exception propagates and ack is skipped — + # the message will be recovered on next startup. + await self.ack(msg_id) return data except Exception as e: logger.debug(f"[NamedQueue] Dequeue failed for {self.name}: {e}") diff --git a/openviking/storage/queuefs/queue_manager.py b/openviking/storage/queuefs/queue_manager.py index 95e9aeb2..52b42476 100644 --- a/openviking/storage/queuefs/queue_manager.py +++ b/openviking/storage/queuefs/queue_manager.py @@ -107,16 +107,16 @@ def start(self) -> None: logger.info("[QueueManager] Started") - def setup_standard_queues(self, vector_store: Any) -> None: + def setup_standard_queues(self, vector_store: Any, start: bool = True) -> None: """ Setup standard queues (Embedding and Semantic) with their handlers. - This method initializes the EmbeddingQueue with TextEmbeddingHandler - and the SemanticQueue with SemanticProcessor, then ensures the - queue manager is started. - Args: vector_store: Vector store instance for handlers to write results. + start: Whether to start worker threads immediately (default True). + Pass False when the consumer depends on resources that are + not yet initialized (e.g. 
VikingFS); call start() manually + after those resources are ready. """ # Import handlers here to avoid circular dependencies from openviking.storage.collection_schemas import TextEmbeddingHandler @@ -140,8 +140,8 @@ def setup_standard_queues(self, vector_store: Any) -> None: ) logger.info("Semantic queue initialized with SemanticProcessor") - # Start QueueManager processing - self.start() + if start: + self.start() def _start_queue_worker(self, queue: NamedQueue) -> None: """Start a dedicated worker thread for a queue if not already running.""" @@ -207,10 +207,14 @@ async def _worker_async_concurrent( async def process_one(data: Dict[str, Any]) -> None: async with sem: + msg_id = data.get("id", "") if isinstance(data, dict) else "" try: await queue.process_dequeued(data) + # Ack after successful processing (delete from persistent storage). + await queue.ack(msg_id) except Exception as e: # Handler did not call report_error; decrement in_progress manually. + # Do NOT ack — let RecoverStale re-queue on next startup. 
queue._on_process_error(str(e), data) logger.error(f"[QueueManager] Concurrent worker error for {queue.name}: {e}") @@ -241,9 +245,21 @@ async def process_one(data: Dict[str, Any]) -> None: await asyncio.sleep(self._poll_interval) - # Drain remaining in-flight tasks on shutdown + # Drain remaining in-flight tasks on shutdown (with timeout) if active_tasks: - await asyncio.gather(*active_tasks, return_exceptions=True) + try: + await asyncio.wait_for( + asyncio.gather(*active_tasks, return_exceptions=True), + timeout=5.0, + ) + except asyncio.TimeoutError: + logger.warning( + f"[QueueManager] Drain timeout for {queue.name}, " + f"cancelling {len(active_tasks)} in-flight task(s)" + ) + for t in active_tasks: + t.cancel() + await asyncio.gather(*active_tasks, return_exceptions=True) def stop(self) -> None: """Stop QueueManager and release resources.""" @@ -254,8 +270,10 @@ def stop(self) -> None: # Stop queue workers for stop_event in self._queue_stop_events.values(): stop_event.set() - for thread in self._queue_threads.values(): - thread.join() + for name, thread in self._queue_threads.items(): + thread.join(timeout=10.0) + if thread.is_alive(): + logger.warning(f"[QueueManager] Worker thread {name} did not exit in time") self._queue_threads.clear() self._queue_stop_events.clear() @@ -280,9 +298,6 @@ def get_queue( allow_create: bool = False, ) -> NamedQueue: """Get or create a named queue object.""" - if not self._started: - self.start() - if name not in self._queues: if not allow_create: raise RuntimeError(f"Queue {name} does not exist and allow_create is False") diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index b1e6407c..d626fa82 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -75,6 +75,7 @@ def __init__( target_uri: Optional[str] = None, semantic_msg_id: Optional[str] = None, recursive: bool = True, + lifecycle_lock_handle_id: str = "", ): 
self._processor = processor self._context_type = context_type @@ -84,6 +85,7 @@ def __init__( self._target_uri = target_uri self._semantic_msg_id = semantic_msg_id self._recursive = recursive + self._lifecycle_lock_handle_id = lifecycle_lock_handle_id self._llm_sem = asyncio.Semaphore(max_concurrent_llm) self._viking_fs = get_viking_fs() self._nodes: Dict[str, DirNode] = {} @@ -98,6 +100,7 @@ def __init__( self._dir_change_status: Dict[str, bool] = {} self._overview_cache: Dict[str, Dict[str, str]] = {} self._overview_cache_lock = asyncio.Lock() + self._refresh_task: Optional[asyncio.Task] = None def _create_on_complete_callback(self) -> Optional[Callable[[], Awaitable[None]]]: """Create on_complete callback for incremental update or full update.""" @@ -160,10 +163,27 @@ async def run(self, root_uri: str) -> None: """Run DAG execution starting from root_uri.""" self._root_uri = root_uri self._root_done = asyncio.Event() - await self._dispatch_dir(root_uri, parent_uri=None) - await self._root_done.wait() - on_complete = self._create_on_complete_callback() + # Start lifecycle lock refresh loop if we hold a lock + if self._lifecycle_lock_handle_id: + self._refresh_task = asyncio.create_task(self._lock_refresh_loop()) + + try: + await self._dispatch_dir(root_uri, parent_uri=None) + await self._root_done.wait() + except Exception: + await self._release_lifecycle_lock() + raise + + original_on_complete = self._create_on_complete_callback() + + # Wrap on_complete to release lifecycle lock after all processing + async def wrapped_on_complete() -> None: + try: + if original_on_complete: + await original_on_complete() + finally: + await self._release_lifecycle_lock() async with self._vectorize_lock: task_count = self._vectorize_task_count @@ -176,7 +196,7 @@ async def run(self, root_uri: str) -> None: await tracker.register( semantic_msg_id=self._semantic_msg_id, total_count=task_count, - on_complete=on_complete, + on_complete=wrapped_on_complete, metadata={"uri": root_uri}, 
) @@ -203,9 +223,10 @@ async def run(self, root_uri: str) -> None: semantic_msg_id=task.semantic_msg_id, ) ) - elif on_complete: + else: + # No vectorize tasks — release lock immediately (via wrapped callback) try: - await on_complete() + await wrapped_on_complete() except Exception as e: logger.error(f"Error in on_complete callback: {e}", exc_info=True) @@ -510,6 +531,7 @@ async def _overview_task(self, dir_uri: str) -> None: return need_vectorize = True children_changed = True + abstract = "" try: overview = None abstract = None @@ -532,11 +554,12 @@ async def _overview_task(self, dir_uri: str) -> None: abstract = self._processor._extract_abstract_from_overview(overview) overview, abstract = self._processor._enforce_size_limits(overview, abstract) + # Write directly — protected by the outer lifecycle SUBTREE lock try: await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) - except Exception as e: - logger.warning(f"Failed to write overview/abstract for {dir_uri}: {e}") + except Exception: + logger.info(f"[SemanticDag] {dir_uri} write failed, skipping") try: if need_vectorize: @@ -555,7 +578,6 @@ async def _overview_task(self, dir_uri: str) -> None: except Exception as e: logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True) - abstract = "" finally: self._stats.done_nodes += 1 self._stats.in_progress_nodes = max(0, self._stats.in_progress_nodes - 1) @@ -579,6 +601,46 @@ async def _add_vectorize_task(self, task: VectorizeTask) -> None: else: # directory self._vectorize_task_count += 2 + async def _lock_refresh_loop(self) -> None: + """Periodically refresh lifecycle lock to prevent stale expiry.""" + from openviking.storage.transaction import get_lock_manager + + try: + interval = get_lock_manager()._path_lock._lock_expire / 2 + except Exception: + interval = 150.0 + + while True: + try: + await asyncio.sleep(interval) + handle 
= get_lock_manager().get_handle(self._lifecycle_lock_handle_id) + if handle: + await get_lock_manager().refresh_lock(handle) + else: + break + except asyncio.CancelledError: + break + except Exception as e: + logger.warning(f"[SemanticDag] Lock refresh failed: {e}") + + async def _release_lifecycle_lock(self) -> None: + """Stop refresh loop and release lifecycle lock.""" + if self._refresh_task and not self._refresh_task.done(): + self._refresh_task.cancel() + self._refresh_task = None + if not self._lifecycle_lock_handle_id: + return + handle_id = self._lifecycle_lock_handle_id + self._lifecycle_lock_handle_id = "" + try: + from openviking.storage.transaction import get_lock_manager + + handle = get_lock_manager().get_handle(handle_id) + if handle: + await get_lock_manager().release(handle) + except Exception as e: + logger.warning(f"[SemanticDag] Failed to release lifecycle lock {handle_id}: {e}") + def get_stats(self) -> DagStats: return DagStats( total_nodes=self._stats.total_nodes, diff --git a/openviking/storage/queuefs/semantic_msg.py b/openviking/storage/queuefs/semantic_msg.py index f6acdaf4..720948e8 100644 --- a/openviking/storage/queuefs/semantic_msg.py +++ b/openviking/storage/queuefs/semantic_msg.py @@ -39,6 +39,7 @@ class SemanticMsg: skip_vectorization: bool = False telemetry_id: str = "" target_uri: str = "" + lifecycle_lock_handle_id: str = "" changes: Optional[Dict[str, List[str]]] = ( None # {"added": [...], "modified": [...], "deleted": [...]} ) @@ -55,6 +56,7 @@ def __init__( skip_vectorization: bool = False, telemetry_id: str = "", target_uri: str = "", + lifecycle_lock_handle_id: str = "", changes: Optional[Dict[str, List[str]]] = None, ): self.id = str(uuid4()) @@ -68,6 +70,7 @@ def __init__( self.skip_vectorization = skip_vectorization self.telemetry_id = telemetry_id self.target_uri = target_uri + self.lifecycle_lock_handle_id = lifecycle_lock_handle_id self.changes = changes def to_dict(self) -> Dict[str, Any]: @@ -106,6 +109,7 @@ def 
from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg": skip_vectorization=data.get("skip_vectorization", False), telemetry_id=data.get("telemetry_id", ""), target_uri=data.get("target_uri", ""), + lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""), changes=data.get("changes"), ) if "id" in data and data["id"]: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index b4c9ac5c..98d32f7a 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -239,6 +239,14 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, f"Target URI exists, using incremental update: {msg.target_uri}" ) + # Re-acquire lifecycle lock if handle was lost (e.g. server restart) + if msg.lifecycle_lock_handle_id: + lock_uri = msg.target_uri or msg.uri + msg.lifecycle_lock_handle_id = await self._ensure_lifecycle_lock( + msg.lifecycle_lock_handle_id, + viking_fs._uri_to_path(lock_uri, ctx=self._current_ctx), + ) + executor = SemanticDagExecutor( processor=self, context_type=msg.context_type, @@ -248,6 +256,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, target_uri=msg.target_uri, semantic_msg_id=msg.id, recursive=msg.recursive, + lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id, ) self._dag_executor = executor await executor.run(msg.uri) @@ -268,6 +277,22 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, self.report_error(str(e), data) return None finally: + # Safety net: release lifecycle lock if still held (e.g. 
on exception + # before the DAG executor took ownership) + if msg and msg.lifecycle_lock_handle_id: + try: + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + handle = lm.get_handle(msg.lifecycle_lock_handle_id) + if handle: + await lm.release(handle) + logger.info( + f"[SemanticProcessor] Safety-net released lifecycle lock " + f"{msg.lifecycle_lock_handle_id}" + ) + except Exception: + pass self._current_msg = None self._current_ctx = None @@ -276,6 +301,25 @@ def get_dag_stats(self) -> Optional["DagStats"]: return None return self._dag_executor.get_stats() + @staticmethod + async def _ensure_lifecycle_lock(handle_id: str, lock_path: str) -> str: + """If the handle is missing (server restart), re-acquire a SUBTREE lock. + + Returns the (possibly new) handle ID, or "" on failure. + """ + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + if lm.get_handle(handle_id): + return handle_id + new_handle = lm.create_handle() + if await lm.acquire_subtree(new_handle, lock_path): + logger.info(f"Re-acquired lifecycle lock on {lock_path} (handle {new_handle.id})") + return new_handle.id + logger.warning(f"Failed to re-acquire lifecycle lock on {lock_path}") + await lm.release(new_handle) + return "" + async def _process_memory_directory(self, msg: SemanticMsg) -> None: """Process a memory directory with special handling. diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index b6c06d6e..52e77ff7 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -3,25 +3,30 @@ """ Transaction module for OpenViking. -Provides transaction management and lock mechanisms for data operations. +Provides path-lock management and redo-log crash recovery. 
""" -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_manager import ( - TransactionManager, - get_transaction_manager, - init_transaction_manager, -) -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_handle import LockHandle, LockOwner +from openviking.storage.transaction.lock_manager import ( + LockManager, + get_lock_manager, + init_lock_manager, + release_all_locks, + reset_lock_manager, ) +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog __all__ = [ + "LockContext", + "LockHandle", + "LockManager", + "LockOwner", "PathLock", - "TransactionManager", - "TransactionRecord", - "TransactionStatus", - "init_transaction_manager", - "get_transaction_manager", + "RedoLog", + "get_lock_manager", + "init_lock_manager", + "release_all_locks", + "reset_lock_manager", ] diff --git a/openviking/storage/transaction/lock_context.py b/openviking/storage/transaction/lock_context.py new file mode 100644 index 00000000..4d1d8443 --- /dev/null +++ b/openviking/storage/transaction/lock_context.py @@ -0,0 +1,68 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockContext — async context manager for acquiring/releasing path locks.""" + +from typing import Optional + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.lock_manager import LockManager + + +class LockContext: + """``async with LockContext(manager, paths, mode) as handle: ...`` + + Acquires locks on entry, releases them on exit. No undo / journal / commit + semantics — just a lock scope. 
+ """ + + def __init__( + self, + lock_manager: LockManager, + paths: list[str], + lock_mode: str = "point", + mv_dst_parent_path: Optional[str] = None, + src_is_dir: bool = True, + ): + self._manager = lock_manager + self._paths = paths + self._lock_mode = lock_mode + self._mv_dst_parent_path = mv_dst_parent_path + self._src_is_dir = src_is_dir + self._handle: Optional[LockHandle] = None + + async def __aenter__(self) -> LockHandle: + self._handle = self._manager.create_handle() + success = False + + if self._lock_mode == "subtree": + for path in self._paths: + success = await self._manager.acquire_subtree(self._handle, path) + if not success: + break + elif self._lock_mode == "mv": + if self._mv_dst_parent_path is None: + raise LockAcquisitionError("mv lock mode requires mv_dst_parent_path") + success = await self._manager.acquire_mv( + self._handle, + self._paths[0], + self._mv_dst_parent_path, + src_is_dir=self._src_is_dir, + ) + else: # "point" + for path in self._paths: + success = await self._manager.acquire_point(self._handle, path) + if not success: + break + + if not success: + await self._manager.release(self._handle) + raise LockAcquisitionError( + f"Failed to acquire {self._lock_mode} lock for {self._paths}" + ) + return self._handle + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self._handle: + await self._manager.release(self._handle) + return False diff --git a/openviking/storage/transaction/lock_handle.py b/openviking/storage/transaction/lock_handle.py new file mode 100644 index 00000000..7b5be5d9 --- /dev/null +++ b/openviking/storage/transaction/lock_handle.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Lock handle and LockOwner protocol for PathLock integration.""" + +import time +import uuid +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class LockOwner(Protocol): + """Minimal interface that PathLock requires from its caller.""" + + id: str + locks: list[str] + + def add_lock(self, path: str) -> None: ... + def remove_lock(self, path: str) -> None: ... + + +@dataclass +class LockHandle: + """Identifies a lock holder. PathLock uses ``id`` to generate fencing tokens + and ``locks`` to track acquired lock files.""" + + id: str = field(default_factory=lambda: str(uuid.uuid4())) + locks: list[str] = field(default_factory=list) + created_at: float = field(default_factory=time.time) + + def add_lock(self, lock_path: str) -> None: + if lock_path not in self.locks: + self.locks.append(lock_path) + + def remove_lock(self, lock_path: str) -> None: + if lock_path in self.locks: + self.locks.remove(lock_path) diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py new file mode 100644 index 00000000..2fec7e42 --- /dev/null +++ b/openviking/storage/transaction/lock_manager.py @@ -0,0 +1,263 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockManager — global singleton managing lock lifecycle and redo recovery.""" + +import asyncio +import json +import time +from typing import Any, Dict, Optional + +from openviking.pyagfs import AGFSClient +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockManager: + """Global singleton. 
Manages lock lifecycle and stale cleanup.""" + + def __init__( + self, + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, + ): + self._agfs = agfs + self._path_lock = PathLock(agfs, lock_expire=lock_expire) + self._lock_timeout = lock_timeout + self._redo_log = RedoLog(agfs) + self._handles: Dict[str, LockHandle] = {} + self._cleanup_task: Optional[asyncio.Task] = None + self._running = False + + @property + def redo_log(self) -> RedoLog: + return self._redo_log + + def get_active_handles(self) -> Dict[str, LockHandle]: + return dict(self._handles) + + async def start(self) -> None: + """Start background cleanup and redo recovery.""" + self._running = True + self._cleanup_task = asyncio.create_task(self._stale_cleanup_loop()) + await self._recover_pending_redo() + + async def stop(self) -> None: + """Stop cleanup and release all active locks.""" + self._running = False + if self._cleanup_task: + self._cleanup_task.cancel() + try: + if self._cleanup_task.get_loop() is asyncio.get_running_loop(): + await self._cleanup_task + except asyncio.CancelledError: + pass + self._cleanup_task = None + for handle in list(self._handles.values()): + await self._path_lock.release(handle) + self._handles.clear() + + def create_handle(self) -> LockHandle: + handle = LockHandle() + self._handles[handle.id] = handle + return handle + + async def acquire_point( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_point( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_subtree( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_subtree( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_mv( + self, + handle: LockHandle, + src: str, + dst_parent: str, + src_is_dir: bool = True, + timeout: Optional[float] = None, 
+ ) -> bool: + return await self._path_lock.acquire_mv( + src, + dst_parent, + handle, + timeout=timeout if timeout is not None else self._lock_timeout, + src_is_dir=src_is_dir, + ) + + def get_handle(self, handle_id: str) -> Optional[LockHandle]: + return self._handles.get(handle_id) + + async def refresh_lock(self, handle: LockHandle) -> None: + await self._path_lock.refresh(handle) + + async def release(self, handle: LockHandle) -> None: + await self._path_lock.release(handle) + self._handles.pop(handle.id, None) + + async def _stale_cleanup_loop(self) -> None: + """Check and release leaked handles every 60 s (in-process safety net).""" + while self._running: + await asyncio.sleep(60) + now = time.time() + stale = [h for h in self._handles.values() if now - h.created_at > 3600] + for handle in stale: + logger.warning(f"Releasing stale lock handle {handle.id}") + await self.release(handle) + + # ------------------------------------------------------------------ + # Redo recovery (session_memory only) + # ------------------------------------------------------------------ + + async def _recover_pending_redo(self) -> None: + pending_ids = self._redo_log.list_pending() + for task_id in pending_ids: + logger.info(f"Recovering pending redo task: {task_id}") + try: + info = self._redo_log.read(task_id) + if info: + await self._redo_session_memory(info) + self._redo_log.mark_done(task_id) + except Exception as e: + logger.error(f"Redo recovery failed for {task_id}: {e}", exc_info=True) + + async def _redo_session_memory(self, info: Dict[str, Any]) -> None: + """Re-extract memories from archive. + + Lets exceptions from _enqueue_semantic propagate so the caller + can decide whether to mark the redo task as done. 
+ """ + from openviking.message import Message + from openviking.server.identity import RequestContext, Role + from openviking.session.compressor import SessionCompressor + from openviking.storage.viking_fs import get_viking_fs + from openviking_cli.session.user_id import UserIdentifier + + archive_uri = info.get("archive_uri") + session_uri = info.get("session_uri") + account_id = info.get("account_id", "default") + user_id = info.get("user_id", "default") + agent_id = info.get("agent_id", "default") + role_str = info.get("role", "root") + + if not archive_uri or not session_uri: + raise ValueError("Cannot redo session_memory: missing archive_uri or session_uri") + + # 1. Build request context (needed for path conversion below) + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str)) + + # 2. Read archived messages + messages_uri = f"{archive_uri}/messages.jsonl" + viking_fs = get_viking_fs() + agfs_path = viking_fs._uri_to_path(messages_uri, ctx=ctx) + messages = [] + try: + content = self._agfs.cat(agfs_path) + if isinstance(content, bytes): + content = content.decode("utf-8") + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass + except Exception as e: + logger.warning(f"Cannot read archive for redo: {agfs_path}: {e}") + + # 3. Re-extract memories (best-effort, only if archive was readable) + if messages: + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + try: + compressor = SessionCompressor(vikingdb=None) + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + except Exception as e: + logger.warning(f"Redo: memory extraction failed ({e}), falling back to queue") + + # 4. 
Always enqueue semantic processing as fallback + await self._enqueue_semantic( + uri=session_uri, + context_type="memory", + account_id=account_id, + user_id=user_id, + agent_id=agent_id, + role=role_str, + ) + + async def _enqueue_semantic(self, **params: Any) -> None: + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + from openviking.storage.queuefs.semantic_queue import SemanticQueue + + queue_manager = get_queue_manager() + if queue_manager is None: + logger.debug("No queue manager available, skipping enqueue_semantic") + return + + uri = params.get("uri") + if not uri: + return + + msg = SemanticMsg( + uri=uri, + context_type=params.get("context_type", "resource"), + account_id=params.get("account_id", "default"), + user_id=params.get("user_id", "default"), + agent_id=params.get("agent_id", "default"), + role=params.get("role", "root"), + ) + semantic_queue: SemanticQueue = queue_manager.get_queue(queue_manager.SEMANTIC) # type: ignore[assignment] + await semantic_queue.enqueue(msg) + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_lock_manager: Optional[LockManager] = None + + +def init_lock_manager( + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, +) -> LockManager: + global _lock_manager + _lock_manager = LockManager(agfs=agfs, lock_timeout=lock_timeout, lock_expire=lock_expire) + return _lock_manager + + +def get_lock_manager() -> LockManager: + if _lock_manager is None: + raise RuntimeError("LockManager not initialized. Call init_lock_manager() first.") + return _lock_manager + + +def reset_lock_manager() -> None: + global _lock_manager + _lock_manager = None + + +async def release_all_locks() -> None: + """Release all active lock handles. 
**Test-only utility.**""" + if _lock_manager is None: + return + for handle in list(_lock_manager.get_active_handles().values()): + await _lock_manager.release(handle) diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 1a2ad7b0..2aaaecf1 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -1,17 +1,9 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Path lock implementation for transaction management. - -Provides path-based locking mechanism to prevent concurrent directory operations. -Lock protocol: viking://resources/.../.path.ovlock file exists = locked -""" - import asyncio -from typing import List, Optional +import time +from typing import Optional, Tuple from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.transaction_record import TransactionRecord +from openviking.storage.transaction.lock_handle import LockOwner from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) @@ -19,312 +11,383 @@ # Lock file name LOCK_FILE_NAME = ".path.ovlock" +# Lock type constants +LOCK_TYPE_POINT = "P" +LOCK_TYPE_SUBTREE = "S" -class PathLock: - """Path lock manager for transaction-based directory locking. +# Default poll interval when waiting for a lock (seconds) +_POLL_INTERVAL = 0.2 - Implements path-based locking using lock files (.path.ovlock) to prevent - concurrent operations on the same directory tree. - """ - def __init__(self, agfs_client: AGFSClient): - """Initialize path lock manager. +def _make_fencing_token(owner_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: + return f"{owner_id}:{time.time_ns()}:{lock_type}" - Args: - agfs_client: AGFS client for file system operations - """ - self._agfs = agfs_client - def _get_lock_path(self, path: str) -> str: - """Get lock file path for a directory. 
+def _parse_fencing_token(token: str) -> Tuple[str, int, str]: + if token.endswith(f":{LOCK_TYPE_POINT}") or token.endswith(f":{LOCK_TYPE_SUBTREE}"): + lock_type = token[-1] + rest = token[:-2] + idx = rest.rfind(":") + if idx >= 0: + owner_id_part = rest[:idx] + ts_part = rest[idx + 1 :] + try: + return owner_id_part, int(ts_part), lock_type + except ValueError: + pass + return rest, 0, lock_type - Args: - path: Directory path to lock + if ":" in token: + idx = token.rfind(":") + owner_id_part = token[:idx] + ts_part = token[idx + 1 :] + try: + return owner_id_part, int(ts_part), LOCK_TYPE_POINT + except ValueError: + pass - Returns: - Lock file path (path/.path.ovlock) - """ - # Remove trailing slash if present + return token, 0, LOCK_TYPE_POINT + + +class PathLock: + def __init__(self, agfs_client: AGFSClient, lock_expire: float = 300.0): + self._agfs = agfs_client + self._lock_expire = lock_expire + + def _get_lock_path(self, path: str) -> str: path = path.rstrip("/") return f"{path}/{LOCK_FILE_NAME}" def _get_parent_path(self, path: str) -> Optional[str]: - """Get parent directory path. - - Args: - path: Directory path - - Returns: - Parent directory path or None if at root - """ path = path.rstrip("/") if "/" not in path: return None parent = path.rsplit("/", 1)[0] return parent if parent else None - async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: - """Check if path is locked by another transaction. 
- - Args: - lock_path: Lock file path - transaction_id: Current transaction ID - - Returns: - True if locked by another transaction, False otherwise - """ + def _read_token(self, lock_path: str) -> Optional[str]: try: - content = self._agfs.cat(lock_path) + content = self._agfs.read(lock_path) if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() + token = content.decode("utf-8").strip() else: - lock_owner = str(content).strip() - return lock_owner != transaction_id + token = str(content).strip() + return token if token else None except Exception: - # Lock file doesn't exist or can't be read - not locked - return False - - async def _create_lock_file(self, lock_path: str, transaction_id: str) -> None: - """Create lock file with transaction ID. - - Args: - lock_path: Lock file path - transaction_id: Transaction ID to write to lock file - """ - self._agfs.write(lock_path, transaction_id.encode("utf-8")) - - async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: - """Verify lock file is owned by current transaction. 
- - Args: - lock_path: Lock file path - transaction_id: Current transaction ID + return None - Returns: - True if lock is owned by current transaction, False otherwise - """ - try: - content = self._agfs.cat(lock_path) - if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() - else: - lock_owner = str(content).strip() - return lock_owner == transaction_id - except Exception: + async def _is_locked_by_other(self, lock_path: str, owner_id: str) -> bool: + token = self._read_token(lock_path) + if token is None: return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner != owner_id + + async def _create_lock_file( + self, lock_path: str, owner_id: str, lock_type: str = LOCK_TYPE_POINT + ) -> None: + token = _make_fencing_token(owner_id, lock_type) + self._agfs.write(lock_path, token.encode("utf-8")) + + async def _verify_lock_ownership(self, lock_path: str, owner_id: str) -> bool: + token = self._read_token(lock_path) + if token is None: + return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner == owner_id - async def _remove_lock_file(self, lock_path: str) -> None: - """Remove lock file. - - Args: - lock_path: Lock file path - """ + async def _remove_lock_file(self, lock_path: str) -> bool: try: self._agfs.rm(lock_path) - except Exception: - # Lock file might not exist, ignore - pass - - async def acquire_normal(self, path: str, transaction: TransactionRecord) -> bool: - """Acquire path lock for normal operations. - - Lock acquisition flow for normal operations: - 1. Check if target directory exists - 2. Check if target directory is locked by another transaction - 3. Check if parent directory is locked by another transaction - 4. Create .path.ovlock file with transaction ID - 5. Check again if parent directory is locked by another transaction - 6. Read lock file to confirm it contains current transaction ID - 7. 
Return success if all checks pass + return True + except Exception as e: + if "not found" in str(e).lower(): + return True + return False - Args: - path: Directory path to lock - transaction: Transaction record + def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: + token = self._read_token(lock_path) + if token is None: + return True + _, ts, _ = _parse_fencing_token(token) + if ts == 0: + return True + age = (time.time_ns() - ts) / 1e9 + return age > expire_seconds + + async def _check_ancestors_for_subtree(self, path: str, exclude_owner_id: str) -> Optional[str]: + parent = self._get_parent_path(path) + while parent: + lock_path = self._get_lock_path(parent) + token = self._read_token(lock_path) + if token is not None: + owner_id, _, lock_type = _parse_fencing_token(token) + if owner_id != exclude_owner_id and lock_type == LOCK_TYPE_SUBTREE: + return lock_path + parent = self._get_parent_path(parent) + return None + + async def _scan_descendants_for_locks(self, path: str, exclude_owner_id: str) -> Optional[str]: + try: + entries = self._agfs.ls(path) + if not isinstance(entries, list): + return None + for entry in entries: + if not isinstance(entry, dict): + continue + name = entry.get("name", "") + if not name or name in (".", ".."): + continue + if not entry.get("isDir", False): + continue + subdir = f"{path.rstrip('/')}/{name}" + subdir_lock = self._get_lock_path(subdir) + token = self._read_token(subdir_lock) + if token is not None: + owner_id, _, _ = _parse_fencing_token(token) + if owner_id != exclude_owner_id: + return subdir_lock + result = await self._scan_descendants_for_locks(subdir, exclude_owner_id) + if result: + return result + except Exception as e: + logger.warning(f"Failed to scan descendants of {path}: {e}") + return None - Returns: - True if lock acquired successfully, False otherwise - """ - transaction_id = transaction.id + async def acquire_point(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: 
+ owner_id = owner.id lock_path = self._get_lock_path(path) - parent_path = self._get_parent_path(path) + deadline = asyncio.get_running_loop().time() + timeout - # Step 1: Check if target directory exists try: self._agfs.stat(path) except Exception: - logger.warning(f"Directory does not exist: {path}") + logger.warning(f"[POINT] Directory does not exist: {path}") return False - # Step 2: Check if target directory is locked by another transaction - if await self._is_locked_by_other(lock_path, transaction_id): - logger.warning(f"Path already locked by another transaction: {path}") - return False - - # Step 3: Check if parent directory is locked by another transaction - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked by another transaction: {parent_path}") + while True: + if await self._is_locked_by_other(lock_path, owner_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[POINT] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_running_loop().time() >= deadline: + logger.warning(f"[POINT] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + continue + if asyncio.get_running_loop().time() >= deadline: + logger.warning( + f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + try: + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_POINT) + except Exception as e: + logger.error(f"[POINT] Failed to create 
lock file: {e}") return False - # Step 4: Create lock file - try: - await self._create_lock_file(lock_path, transaction_id) - except Exception as e: - logger.error(f"Failed to create lock file: {e}") - return False - - # Step 5: Check again if parent directory is locked - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked after lock creation: {parent_path}") - await self._remove_lock_file(lock_path) - return False - - # Step 6: Verify lock ownership - if not await self._verify_lock_ownership(lock_path, transaction_id): - logger.error(f"Lock ownership verification failed: {path}") - return False - - # Step 7: Success - add lock to transaction - transaction.add_lock(lock_path) - logger.debug(f"Lock acquired: {lock_path}") - return True - - async def _collect_subdirectories(self, path: str) -> List[str]: - """Collect all subdirectory paths recursively. - - Args: - path: Root directory path - - Returns: - List of all subdirectory paths - """ - subdirs = [] - try: - entries = self._agfs.ls(path) - if isinstance(entries, list): - for entry in entries: - if isinstance(entry, dict) and entry.get("isDir"): - entry_path = entry.get("name", "") - if entry_path: - subdirs.append(entry_path) - # Recursively collect subdirectories - subdirs.extend(await self._collect_subdirectories(entry_path)) - except Exception as e: - logger.warning(f"Failed to list directory {path}: {e}") - - return subdirs - - async def acquire_rm( - self, path: str, transaction: TransactionRecord, max_parallel: int = 8 - ) -> bool: - """Acquire path lock for rm operation using bottom-up parallel locking. - - Lock acquisition flow for rm operations (parallel bottom-up mode): - 1. Collect all subdirectory paths recursively - 2. Sort by depth (deepest first) - 3. Create lock files in batches with limited parallelism - 4. Lock the target directory last - 5. 
If any lock fails, release all acquired locks in reverse order - - Args: - path: Directory path to lock - transaction: Transaction record - max_parallel: Maximum number of parallel lock operations + backed_off = False + conflict_after = await self._check_ancestors_for_subtree(path, owner_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) + ) + if (my_ts, owner_id) > (their_ts, their_owner_id): + logger.debug(f"[POINT] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + if asyncio.get_running_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + if not await self._verify_lock_ownership(lock_path, owner_id): + logger.debug(f"[POINT] Lock ownership verification failed: {path}") + if asyncio.get_running_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + owner.add_lock(lock_path) + logger.debug(f"[POINT] Lock acquired: {lock_path}") + return True - Returns: - True if all locks acquired successfully, False otherwise - """ - transaction_id = transaction.id + async def acquire_subtree(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: + owner_id = owner.id lock_path = self._get_lock_path(path) - acquired_locks = [] + deadline = asyncio.get_running_loop().time() + timeout - # Step 1: Collect all subdirectories - subdirs = await self._collect_subdirectories(path) + try: + self._agfs.stat(path) + except Exception: + logger.warning(f"[SUBTREE] Directory does not exist: {path}") + return False - # Step 2: Sort by depth (deepest first) - subdirs.sort(key=lambda p: p.count("/"), reverse=True) + while True: + if await 
self._is_locked_by_other(lock_path, owner_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_running_loop().time() >= deadline: + logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Check ancestor paths for SUBTREE locks held by other owners + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + continue + if asyncio.get_running_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + desc_conflict = await self._scan_descendants_for_locks(path, owner_id) + if desc_conflict: + if self.is_lock_stale(desc_conflict, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") + await self._remove_lock_file(desc_conflict) + continue + if asyncio.get_running_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + try: + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_SUBTREE) + except Exception as e: + logger.error(f"[SUBTREE] Failed to create lock file: {e}") + return False - # Step 3: Create lock files in batches - try: - # Lock subdirectories in batches - for i in range(0, len(subdirs), max_parallel): - batch = subdirs[i : i + max_parallel] - tasks = [] - for subdir in batch: - subdir_lock_path = self._get_lock_path(subdir) - tasks.append(self._create_lock_file(subdir_lock_path, 
transaction_id)) - - # Execute batch in parallel - await asyncio.gather(*tasks) - acquired_locks.extend([self._get_lock_path(s) for s in batch]) - - # Step 4: Lock target directory - await self._create_lock_file(lock_path, transaction_id) - acquired_locks.append(lock_path) - - # Add all locks to transaction - for lock in acquired_locks: - transaction.add_lock(lock) - - logger.debug(f"RM locks acquired for {len(acquired_locks)} paths") + backed_off = False + conflict_after = await self._scan_descendants_for_locks(path, owner_id) + if not conflict_after: + conflict_after = await self._check_ancestors_for_subtree(path, owner_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) + ) + if (my_ts, owner_id) > (their_ts, their_owner_id): + logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + if asyncio.get_running_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + if not await self._verify_lock_ownership(lock_path, owner_id): + logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") + if asyncio.get_running_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + owner.add_lock(lock_path) + logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True - except Exception as e: - logger.error(f"Failed to acquire RM locks: {e}") - # Step 5: Release all acquired locks in reverse order - for lock in reversed(acquired_locks): - await self._remove_lock_file(lock) - return False - async def acquire_mv( self, src_path: str, - dst_path: str, - transaction: TransactionRecord, - max_parallel: int = 8, + 
dst_parent_path: str, + owner: LockOwner, + timeout: float = 0.0, + src_is_dir: bool = True, ) -> bool: - """Acquire path lock for mv operation. - - Lock acquisition flow for mv operations: - 1. Lock source directory (using RM-style locking) - 2. Lock destination directory (using normal locking) + """Acquire locks for a move operation. Args: - src_path: Source directory path - dst_path: Destination directory path - transaction: Transaction record - max_parallel: Maximum number of parallel lock operations - - Returns: - True if all locks acquired successfully, False otherwise + src_path: Source path to lock. + dst_parent_path: Parent directory of the destination to lock. + Callers typically pass the destination's parent so that the + lock covers sibling-level conflicts without requiring the + target to exist yet. + owner: Lock owner handle. + timeout: Maximum seconds to wait for each lock. + src_is_dir: Whether the source is a directory (SUBTREE lock) + or a file (POINT lock on parent). """ - # Step 1: Lock source directory - if not await self.acquire_rm(src_path, transaction, max_parallel): - logger.warning(f"Failed to lock source path: {src_path}") - return False - - # Step 2: Lock destination directory - if not await self.acquire_normal(dst_path, transaction): - logger.warning(f"Failed to lock destination path: {dst_path}") - # Release source locks - await self.release(transaction) - return False + if src_is_dir: + if not await self.acquire_subtree(src_path, owner, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") + return False + if not await self.acquire_subtree(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire SUBTREE lock on destination parent: {dst_parent_path}" + ) + await self.release(owner) + return False + else: + src_parent = src_path.rsplit("/", 1)[0] if "/" in src_path else src_path + if not await self.acquire_point(src_parent, owner, timeout=timeout): + 
logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") + return False + if not await self.acquire_point(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire POINT lock on destination parent: {dst_parent_path}" + ) + await self.release(owner) + return False - logger.debug(f"MV locks acquired: {src_path} -> {dst_path}") + logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_parent_path}") return True - async def release(self, transaction: TransactionRecord) -> None: - """Release all locks held by the transaction. - - Args: - transaction: Transaction record - """ - # Release locks in reverse order (LIFO) - for lock_path in reversed(transaction.locks): + async def refresh(self, owner: LockOwner) -> None: + """Rewrite all lock file timestamps to prevent stale cleanup.""" + for lock_path in list(owner.locks): + token = self._read_token(lock_path) + if token: + parsed_owner_id, _, lock_type = _parse_fencing_token(token) + if parsed_owner_id == owner.id: + new_token = _make_fencing_token(owner.id, lock_type) + try: + self._agfs.write(lock_path, new_token.encode("utf-8")) + except Exception as e: + logger.warning(f"Failed to refresh lock {lock_path}: {e}") + + async def release(self, owner: LockOwner) -> None: + lock_count = len(owner.locks) + for lock_path in reversed(owner.locks): await self._remove_lock_file(lock_path) - transaction.remove_lock(lock_path) + owner.remove_lock(lock_path) - logger.debug(f"Released {len(transaction.locks)} locks for transaction {transaction.id}") + logger.debug(f"Released {lock_count} locks for owner {owner.id}") diff --git a/openviking/storage/transaction/redo_log.py b/openviking/storage/transaction/redo_log.py new file mode 100644 index 00000000..80d07dff --- /dev/null +++ b/openviking/storage/transaction/redo_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Lightweight redo log for crash recovery of session_memory operations.""" + +import json +from typing import Any, Dict, List + +from openviking.pyagfs import AGFSClient +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + +_REDO_ROOT = "/local/_system/redo" + + +class RedoLog: + """Lightweight pending-task marker. + + Write a marker before the operation starts; delete it after success. + On startup, scan for leftover markers and redo. + """ + + def __init__(self, agfs: AGFSClient): + self._agfs = agfs + + def _task_path(self, task_id: str) -> str: + return f"{_REDO_ROOT}/{task_id}/redo.json" + + def _ensure_dirs(self, dir_path: str) -> None: + parts = dir_path.strip("/").split("/") + current = "" + for part in parts: + current = f"{current}/{part}" + try: + self._agfs.mkdir(current) + except Exception: + pass + + def write_pending(self, task_id: str, info: Dict[str, Any]) -> None: + """Write a redo marker before the operation starts.""" + dir_path = f"{_REDO_ROOT}/{task_id}" + self._ensure_dirs(dir_path) + data = json.dumps(info, default=str).encode("utf-8") + self._agfs.write(self._task_path(task_id), data) + + def mark_done(self, task_id: str) -> None: + """Delete the redo marker after a successful operation.""" + try: + self._agfs.rm(f"{_REDO_ROOT}/{task_id}", recursive=True) + except Exception as e: + logger.warning(f"Failed to clean redo marker {task_id}: {e}") + + def list_pending(self) -> List[str]: + """Return all pending task IDs (directories under _REDO_ROOT).""" + try: + entries = self._agfs.ls(_REDO_ROOT) + if not isinstance(entries, list): + return [] + return [ + e["name"] + for e in entries + if isinstance(e, dict) and e.get("isDir") and e.get("name") not in (".", "..") + ] + except Exception: + return [] + + def read(self, task_id: str) -> Dict[str, Any]: + """Read the info dict of a pending task.""" + try: + content = self._agfs.cat(self._task_path(task_id)) + if 
isinstance(content, bytes): + content = content.decode("utf-8") + return json.loads(content) + except Exception as e: + logger.warning(f"Failed to read redo info for {task_id}: {e}") + return {} diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py deleted file mode 100644 index da76cde7..00000000 --- a/openviking/storage/transaction/transaction_manager.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction manager for OpenViking. - -Global singleton that manages transaction lifecycle and lock mechanisms. -""" - -import asyncio -import threading -import time -from typing import Any, Dict, Optional - -from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, -) -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - -# Global singleton instance -_transaction_manager: Optional["TransactionManager"] = None -_lock = threading.Lock() - - -class TransactionManager: - """Transaction manager for OpenViking. - - Global singleton that manages transaction lifecycle and lock mechanisms. - Responsible for: - - Allocating transaction IDs - - Managing transaction lifecycle (start, commit, rollback) - - Providing transaction lock mechanism interface, preventing deadlocks - """ - - def __init__( - self, - agfs_client: AGFSClient, - timeout: int = 3600, - max_parallel_locks: int = 8, - ): - """Initialize transaction manager. 
- - Args: - agfs_client: AGFS client for file system operations - timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - """ - self._agfs = agfs_client - self._timeout = timeout - self._max_parallel_locks = max_parallel_locks - self._path_lock = PathLock(agfs_client) - - # Active transactions: {transaction_id: TransactionRecord} - self._transactions: Dict[str, TransactionRecord] = {} - - # Background task for timeout cleanup - self._cleanup_task: Optional[asyncio.Task] = None - self._running = False - - logger.info( - f"TransactionManager initialized (timeout={timeout}s, max_parallel_locks={max_parallel_locks})" - ) - - async def start(self) -> None: - """Start transaction manager. - - Starts the background cleanup task for timed-out transactions. - """ - if self._running: - logger.debug("TransactionManager already running") - return - - self._running = True - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - logger.info("TransactionManager started") - - def stop(self) -> None: - """Stop transaction manager. - - Stops the background cleanup task and releases all resources. 
- """ - if not self._running: - logger.debug("TransactionManager already stopped") - return - - self._running = False - - # Cancel cleanup task - if self._cleanup_task: - self._cleanup_task.cancel() - self._cleanup_task = None - - # Release all active transactions - for tx_id in list(self._transactions.keys()): - self._transactions.pop(tx_id, None) - - logger.info("TransactionManager stopped") - - async def _cleanup_loop(self) -> None: - """Background loop for cleaning up timed-out transactions.""" - while self._running: - try: - await asyncio.sleep(60) # Check every minute - await self._cleanup_timed_out() - except asyncio.CancelledError: - break - except Exception as e: - logger.error(f"Error in cleanup loop: {e}") - - async def _cleanup_timed_out(self) -> None: - """Clean up timed-out transactions.""" - current_time = time.time() - timed_out = [] - - for tx_id, tx in self._transactions.items(): - if current_time - tx.updated_at > self._timeout: - timed_out.append(tx_id) - - for tx_id in timed_out: - logger.warning(f"Transaction timed out: {tx_id}") - await self.rollback(tx_id) - - def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: - """Create a new transaction. - - Args: - init_info: Transaction initialization information - - Returns: - New transaction record - """ - tx = TransactionRecord(init_info=init_info or {}) - self._transactions[tx.id] = tx - logger.debug(f"Transaction created: {tx.id}") - return tx - - def get_transaction(self, transaction_id: str) -> Optional[TransactionRecord]: - """Get transaction by ID. - - Args: - transaction_id: Transaction ID - - Returns: - Transaction record or None if not found - """ - return self._transactions.get(transaction_id) - - async def begin(self, transaction_id: str) -> bool: - """Begin a transaction. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction started successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.AQUIRE) - logger.debug(f"Transaction begun: {transaction_id}") - return True - - async def commit(self, transaction_id: str) -> bool: - """Commit a transaction. - - Args: - transaction_id: Transaction ID - - Returns: - True if transaction committed successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to COMMIT - tx.update_status(TransactionStatus.COMMIT) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - logger.debug(f"Transaction committed: {transaction_id}") - return True - - async def rollback(self, transaction_id: str) -> bool: - """Rollback a transaction. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction rolled back successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to FAIL - tx.update_status(TransactionStatus.FAIL) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - logger.debug(f"Transaction rolled back: {transaction_id}") - return True - - async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: - """Acquire path lock for normal (non-rm/mv) operations. - - Args: - transaction_id: Transaction ID - path: Directory path to lock - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.AQUIRE) - success = await self._path_lock.acquire_normal(path, tx) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_rm( - self, transaction_id: str, path: str, max_parallel: Optional[int] = None - ) -> bool: - """Acquire path lock for rm operation. 
- - Args: - transaction_id: Transaction ID - path: Directory path to lock - max_parallel: Maximum number of parallel lock operations (default: from config) - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_rm(path, tx, parallel) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_mv( - self, - transaction_id: str, - src_path: str, - dst_path: str, - max_parallel: Optional[int] = None, - ) -> bool: - """Acquire path lock for mv operation. - - Args: - transaction_id: Transaction ID - src_path: Source directory path - dst_path: Destination directory path - max_parallel: Maximum number of parallel lock operations (default: from config) - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_mv(src_path, dst_path, tx, parallel) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - def get_active_transactions(self) -> Dict[str, TransactionRecord]: - """Get all active transactions. - - Returns: - Dictionary of active transactions {transaction_id: TransactionRecord} - """ - return self._transactions.copy() - - def get_transaction_count(self) -> int: - """Get the number of active transactions. 
- - Returns: - Number of active transactions - """ - return len(self._transactions) - - -def init_transaction_manager( - agfs: AGFSClient, - tx_timeout: int = 3600, - max_parallel_locks: int = 8, -) -> TransactionManager: - """Initialize transaction manager singleton. - - Args: - agfs: AGFS client instance - tx_timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - - Returns: - TransactionManager instance - """ - global _transaction_manager - - with _lock: - if _transaction_manager is not None: - logger.debug("TransactionManager already initialized") - return _transaction_manager - - # Create transaction manager - _transaction_manager = TransactionManager( - agfs_client=agfs, - timeout=tx_timeout, - max_parallel_locks=max_parallel_locks, - ) - - logger.info("TransactionManager initialized as singleton") - return _transaction_manager - - -def get_transaction_manager() -> Optional[TransactionManager]: - """Get transaction manager singleton. - - Returns: - TransactionManager instance or None if not initialized - """ - return _transaction_manager diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py deleted file mode 100644 index fba6480b..00000000 --- a/openviking/storage/transaction/transaction_record.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction record and status definitions. - -Defines the data structures for tracking transaction lifecycle and state. -""" - -import time -import uuid -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, List - - -class TransactionStatus(str, Enum): - """Transaction status enumeration. 
- - Status machine: INIT -> AQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED - """ - - INIT = "INIT" # Transaction initialized, waiting for lock acquisition - AQUIRE = "AQUIRE" # Acquiring lock resources - EXEC = "EXEC" # Transaction operation in progress - COMMIT = "COMMIT" # Transaction completed successfully - FAIL = "FAIL" # Transaction failed - RELEASING = "RELEASING" # Releasing lock resources - RELEASED = "RELEASED" # Lock resources fully released, transaction ended - - def __str__(self) -> str: - return self.value - - -@dataclass -class TransactionRecord: - """Transaction record for tracking transaction lifecycle. - - Attributes: - id: Transaction ID in UUID format, uniquely identifies a transaction - locks: List of lock paths held by this transaction - status: Current transaction status - init_info: Transaction initialization information - rollback_info: Information for rollback operations - created_at: Creation timestamp (Unix timestamp in seconds) - updated_at: Last update timestamp (Unix timestamp in seconds) - """ - - id: str = field(default_factory=lambda: str(uuid.uuid4())) - locks: List[str] = field(default_factory=list) - status: TransactionStatus = field(default=TransactionStatus.INIT) - init_info: Dict[str, Any] = field(default_factory=dict) - rollback_info: Dict[str, Any] = field(default_factory=dict) - created_at: float = field(default_factory=time.time) - updated_at: float = field(default_factory=time.time) - - def update_status(self, status: TransactionStatus) -> None: - """Update transaction status and timestamp. - - Args: - status: New transaction statusudi - """ - self.status = status - self.updated_at = time.time() - - def add_lock(self, lock_path: str) -> None: - """Add a lock to the transaction. - - Args: - lock_path: Path to be locked - """ - if lock_path not in self.locks: - self.locks.append(lock_path) - self.updated_at = time.time() - - def remove_lock(self, lock_path: str) -> None: - """Remove a lock from the transaction. 
- - Args: - lock_path: Path to be unlocked - """ - if lock_path in self.locks: - self.locks.remove(lock_path) - self.updated_at = time.time() - - def to_dict(self) -> Dict[str, Any]: - """Convert transaction record to dictionary. - - Returns: - Dictionary representation of the transaction record - """ - return { - "id": self.id, - "locks": self.locks, - "status": str(self.status), - "init_info": self.init_info, - "rollback_info": self.rollback_info, - "created_at": self.created_at, - "updated_at": self.updated_at, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Create transaction record from dictionary. - - Args: - data: Dictionary representation of the transaction record - - Returns: - TransactionRecord instance - """ - status_str = data.get("status", "INIT") - status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str - - return cls( - id=data.get("id", str(uuid.uuid4())), - locks=data.get("locks", []), - status=status, - init_info=data.get("init_info", {}), - rollback_info=data.get("rollback_info", {}), - created_at=data.get("created_at", time.time()), - updated_at=data.get("updated_at", time.time()), - ) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 7c45f544..f8b8a356 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -22,7 +22,6 @@ from pathlib import PurePath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from openviking.pyagfs.exceptions import AGFSHTTPError from openviking.server.identity import RequestContext, Role from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime @@ -289,15 +288,47 @@ async def rm( This method is idempotent: deleting a non-existent file succeeds after cleaning up any orphan index records. + + Acquires a path lock, deletes VectorDB records, then FS files. 
+ Raises ResourceBusyError when the target is locked by an ongoing + operation (e.g. semantic processing). """ + from openviking.storage.errors import LockAcquisitionError, ResourceBusyError + from openviking.storage.transaction import LockContext, get_lock_manager + self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) - result = self.agfs.rm(path, recursive=recursive) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - return result + + # Check existence and determine lock strategy + try: + stat = self.agfs.stat(path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False + except Exception: + # Path does not exist: clean up any orphan index records and return + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + logger.info(f"[VikingFS] rm target not found, cleaned orphan index: {uri}") + return {} + + if is_dir: + lock_paths = [path] + lock_mode = "subtree" + else: + parent = path.rsplit("/", 1)[0] if "/" in path else path + lock_paths = [parent] + lock_mode = "point" + + try: + async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + result = self.agfs.rm(path, recursive=recursive) + return result + except LockAcquisitionError: + raise ResourceBusyError(f"Resource is being processed: {uri}") async def mv( self, @@ -305,24 +336,69 @@ async def mv( new_uri: str, ctx: Optional[RequestContext] = None, ) -> Dict[str, Any]: - """Move file/directory + recursively update vector index.""" + """Move file/directory + recursively update vector index. 
+ + Implemented as cp + rm to avoid lock files being carried by FS mv. + On VectorDB update failure the copy is cleaned up so the source stays intact. + """ + from openviking.pyagfs.helpers import cp as agfs_cp + from openviking.storage.transaction import LockContext, get_lock_manager + self._ensure_access(old_uri, ctx) self._ensure_access(new_uri, ctx) old_path = self._uri_to_path(old_uri, ctx=ctx) new_path = self._uri_to_path(new_uri, ctx=ctx) target_uri = self._path_to_uri(old_path, ctx=ctx) - uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) - uris_to_move.append(target_uri) + # Verify source exists and determine type before locking try: - result = self.agfs.mv(old_path, new_path) - await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) - return result - except AGFSHTTPError as e: - if e.status_code == 404: - await self._delete_from_vector_store(uris_to_move, ctx=ctx) - logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") - raise + stat = self.agfs.stat(old_path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False + except Exception: + raise FileNotFoundError(f"mv source not found: {old_uri}") + + dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path + + async with LockContext( + get_lock_manager(), + [old_path], + lock_mode="mv", + mv_dst_parent_path=dst_parent, + src_is_dir=is_dir, + ): + uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) + uris_to_move.append(target_uri) + + # Copy source to destination (source still intact) + try: + agfs_cp(self.agfs, old_path, new_path, recursive=is_dir) + except Exception as e: + if "not found" in str(e).lower(): + await self._delete_from_vector_store(uris_to_move, ctx=ctx) + logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") + raise + + # Remove carried lock file from the copy (directory only) + if is_dir: + carried_lock = new_path.rstrip("/") + 
"/.path.ovlock" + try: + self.agfs.rm(carried_lock) + except Exception: + pass + + # Update VectorDB URIs (on failure, clean up the copy) + try: + await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + except Exception: + try: + self.agfs.rm(new_path, recursive=is_dir) + except Exception: + pass + raise + + # Delete source + self.agfs.rm(old_path, recursive=is_dir) + return {} async def grep( self, @@ -940,20 +1016,20 @@ def _uri_to_path(self, uri: str, ctx: Optional[RequestContext] = None) -> str: safe_parts = [self._shorten_component(p, self._MAX_FILENAME_BYTES) for p in parts] return f"/local/{account_id}/{'/'.join(safe_parts)}" - _INTERNAL_DIRS = {"_system"} + _INTERNAL_NAMES = {"_system", ".path.ovlock"} _ROOT_PATH = "/local" def _ls_entries(self, path: str) -> List[Dict[str, Any]]: """List directory entries, filtering out internal directories. At account root (/local/{account}), uses VALID_SCOPES whitelist. - At other levels, uses _INTERNAL_DIRS blacklist. + At other levels, uses _INTERNAL_NAMES blacklist. """ entries = self.agfs.ls(path) parts = [p for p in path.strip("/").split("/") if p] if len(parts) == 2 and parts[0] == "local": return [e for e in entries if e.get("name") in VikingURI.VALID_SCOPES] - return [e for e in entries if e.get("name") not in self._INTERNAL_DIRS] + return [e for e in entries if e.get("name") not in self._INTERNAL_NAMES] def _path_to_uri(self, path: str, ctx: Optional[RequestContext] = None) -> str: """/local/{account}/... -> viking://... 
@@ -1011,7 +1087,7 @@ def _is_accessible(self, uri: str, ctx: RequestContext) -> bool: return True scope = parts[0] - if scope in {"resources", "temp", "transactions"}: + if scope in {"resources", "temp"}: return True if scope == "_system": return False @@ -1071,19 +1147,6 @@ def _handle_agfs_content(self, result: Union[bytes, Any, None]) -> str: return str(result) except Exception: return "" - """Handle AGFSClient content return types consistently.""" - if isinstance(result, bytes): - return result.decode("utf-8") - elif hasattr(result, "content"): - return result.content.decode("utf-8") - elif result is None: - return "" - else: - # Try to convert to string - try: - return str(result) - except Exception: - return "" def _infer_context_type(self, uri: str): """Infer context_type from URI. Returns None when ambiguous.""" @@ -1296,6 +1359,12 @@ async def read_file( """ self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) + # Verify the file exists before reading, because AGFS read returns + # empty bytes for non-existent files instead of raising an error. 
+ try: + self.agfs.stat(path) + except Exception: + raise NotFoundError(uri, "file") try: content = self.agfs.read(path) except Exception: diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index 880af30b..29026d49 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -882,13 +882,21 @@ def _seed_uri_for_id(uri: str, level: int) -> str: async def increment_active_count(self, ctx: RequestContext, uris: List[str]) -> int: updated = 0 for uri in uris: - records = await self.get_context_by_uri(uri=uri, limit=1, ctx=ctx) + records = await self.get_context_by_uri(uri=uri, limit=100, ctx=ctx) if not records: continue - record = records[0] - current = int(record.get("active_count", 0) or 0) - record["active_count"] = current + 1 - if await self.upsert(record, ctx=ctx): + record_ids = [r["id"] for r in records if r.get("id")] + if not record_ids: + continue + # Re-fetch by ID to get full records including vectors + full_records = await self.get(record_ids, ctx=ctx) + uri_updated = False + for record in full_records: + current = int(record.get("active_count", 0) or 0) + record["active_count"] = current + 1 + if await self.upsert(record, ctx=ctx): + uri_updated = True + if uri_updated: updated += 1 return updated diff --git a/openviking/utils/agfs_utils.py b/openviking/utils/agfs_utils.py index 511cf2d0..8db073d9 100644 --- a/openviking/utils/agfs_utils.py +++ b/openviking/utils/agfs_utils.py @@ -99,6 +99,10 @@ def mount_agfs_backend(agfs: Any, agfs_config: Any) -> None: local_dir = plugin_config["config"]["local_dir"] os.makedirs(local_dir, exist_ok=True) logger.debug(f"[AGFSUtils] Ensured local directory exists: {local_dir}") + # Ensure queuefs db_path parent directory exists before mounting + if plugin_name == "queuefs" and "db_path" in plugin_config.get("config", {}): + db_path = plugin_config["config"]["db_path"] + 
os.makedirs(os.path.dirname(db_path), exist_ok=True) try: agfs.unmount(mount_path) diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index bd56a2ea..adcb4245 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -7,6 +7,7 @@ as described in the OpenViking design document. """ +import asyncio import time from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -209,39 +210,89 @@ async def process_resource( return result + # ============ Phase 3.5: 首次添加立即落盘 + 生命周期锁 ============ + root_uri = result.get("root_uri") + temp_uri = result.get("temp_uri") # temp_doc_uri + candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None + lifecycle_lock_handle_id = "" + + if root_uri and temp_uri: + from openviking.storage.transaction import LockContext, get_lock_manager + + viking_fs = get_viking_fs() + lock_manager = get_lock_manager() + target_exists = await viking_fs.exists(root_uri, ctx=ctx) + + if not target_exists: + # 第一次添加:锁保护下将 temp 移到 final + dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + parent_path = dst_path.rsplit("/", 1)[0] if "/" in dst_path else dst_path + + parent_uri = "/".join(root_uri.rsplit("/", 1)[:-1]) + if parent_uri: + await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) + + async with LockContext(lock_manager, [parent_path], lock_mode="point"): + if candidate_uri: + root_uri = await self.tree_builder._resolve_unique_uri(candidate_uri) + result["root_uri"] = root_uri + dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + + src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) + await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) + + # 在 POINT 锁内获取 SUBTREE 锁(消除竞态窗口) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, dst_path + ) + + try: + await viking_fs.delete_temp(parse_result.temp_dir_path, ctx=ctx) + except Exception: + pass + + result["temp_uri"] = root_uri + else: + # 
增量更新:对目标目录加 SUBTREE 锁 + resource_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, resource_path + ) + # ============ Phase 4: Optional Steps ============ build_index = kwargs.get("build_index", True) temp_uri_for_summarize = result.get("temp_uri") or parse_result.temp_dir_path - if summarize: - # Explicit summarization request. - # If build_index is ALSO True, we want vectorization. - # If build_index is False, we skip vectorization. + should_summarize = summarize or build_index + if should_summarize: skip_vec = not build_index try: await self._get_summarizer().summarize( resource_uris=[result["root_uri"]], ctx=ctx, skip_vectorization=skip_vec, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, temp_uris=[temp_uri_for_summarize], **kwargs, ) except Exception as e: logger.error(f"Summarization failed: {e}") result["warnings"] = result.get("warnings", []) + [f"Summarization failed: {e}"] + elif lifecycle_lock_handle_id: + # 无下游处理接管锁,主动释放 + from openviking.storage.transaction import get_lock_manager - elif build_index: - # Standard compatibility mode: "Just Index it" usually implies ingestion flow. - # We assume this means "Ingest and Index", which requires summarization. 
- try: - await self._get_summarizer().summarize( - resource_uris=[result["root_uri"]], - ctx=ctx, - skip_vectorization=False, - temp_uris=[temp_uri_for_summarize], - **kwargs, - ) - except Exception as e: - logger.error(f"Auto-index failed: {e}") - result["warnings"] = result.get("warnings", []) + [f"Auto-index failed: {e}"] + handle = get_lock_manager().get_handle(lifecycle_lock_handle_id) + if handle: + await get_lock_manager().release(handle) return result + + @staticmethod + async def _try_acquire_lifecycle_lock(lock_manager, path: str) -> str: + """尝试获取 SUBTREE 生命周期锁,失败时优雅降级返回空字符串。""" + handle = lock_manager.create_handle() + if await lock_manager.acquire_subtree(handle, path): + return handle.id + logger.warning(f"[ResourceProcessor] Failed to acquire lifecycle lock on {path}") + await lock_manager.release(handle) + return "" diff --git a/openviking/utils/summarizer.py b/openviking/utils/summarizer.py index 22076f92..e9a1cb20 100644 --- a/openviking/utils/summarizer.py +++ b/openviking/utils/summarizer.py @@ -31,6 +31,7 @@ async def summarize( resource_uris: List[str], ctx: "RequestContext", skip_vectorization: bool = False, + lifecycle_lock_handle_id: str = "", **kwargs, ) -> Dict[str, Any]: """ @@ -71,7 +72,8 @@ async def summarize( role=ctx.role.value, skip_vectorization=skip_vectorization, telemetry_id=telemetry.telemetry_id if telemetry.enabled else "", - target_uri=uri, + target_uri=uri if uri != temp_uri else None, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, ) await semantic_queue.enqueue(msg) enqueued_count += 1 diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py index 422a1be4..739c1af6 100644 --- a/openviking_cli/utils/config/storage_config.py +++ b/openviking_cli/utils/config/storage_config.py @@ -8,6 +8,7 @@ from openviking_cli.utils.logger import get_logger from .agfs_config import AGFSConfig +from .transaction_config import TransactionConfig from .vectordb_config import 
VectorDBBackendConfig logger = get_logger(__name__) @@ -25,6 +26,11 @@ class StorageConfig(BaseModel): agfs: AGFSConfig = Field(default_factory=lambda: AGFSConfig(), description="AGFS configuration") + transaction: TransactionConfig = Field( + default_factory=lambda: TransactionConfig(), + description="Transaction mechanism configuration", + ) + vectordb: VectorDBBackendConfig = Field( default_factory=lambda: VectorDBBackendConfig(), description="VectorDB backend configuration", diff --git a/openviking_cli/utils/config/transaction_config.py b/openviking_cli/utils/config/transaction_config.py new file mode 100644 index 00000000..86d153f8 --- /dev/null +++ b/openviking_cli/utils/config/transaction_config.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel, Field + + +class TransactionConfig(BaseModel): + """Configuration for the transaction mechanism. + + By default, lock acquisition does not wait (``lock_timeout=0``): if a + conflicting lock is held the operation fails immediately with + ``LockAcquisitionError``. Set ``lock_timeout`` to a positive value to + allow the caller to block and retry for up to that many seconds. + """ + + lock_timeout: float = Field( + default=0.0, + description=( + "Path lock acquisition timeout (seconds). " + "0 = fail immediately if locked (default). " + "> 0 = wait/retry up to this many seconds before raising LockAcquisitionError." + ), + ) + + lock_expire: float = Field( + default=300.0, + description=( + "Stale lock expiry threshold (seconds). " + "Locks held longer than this by a crashed process are force-released." 
+ ), + ) + + model_config = {"extra": "forbid"} diff --git a/tests/agfs/test_fs_binding.py b/tests/agfs/test_fs_binding.py index ed8d3d33..3e76ee8f 100644 --- a/tests/agfs/test_fs_binding.py +++ b/tests/agfs/test_fs_binding.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -32,16 +33,16 @@ async def viking_fs_binding_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) - # Ensure test directory exists - await vfs.mkdir("viking://temp/", exist_ok=True) - yield vfs + reset_lock_manager() + @pytest.mark.asyncio class TestVikingFSBindingLocal: diff --git a/tests/agfs/test_fs_binding_s3.py b/tests/agfs/test_fs_binding_s3.py index 692b869d..802d4f6d 100644 --- a/tests/agfs/test_fs_binding_s3.py +++ b/tests/agfs/test_fs_binding_s3.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -57,11 +58,14 @@ async def viking_fs_binding_s3_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_lock_manager() + @pytest.mark.asyncio class TestVikingFSBindingS3: diff --git a/tests/agfs/test_fs_local.py b/tests/agfs/test_fs_local.py index 3a428ed6..41ef0730 100644 --- a/tests/agfs/test_fs_local.py +++ b/tests/agfs/test_fs_local.py @@ -10,6 
+10,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -39,13 +40,15 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index 330c7089..00504fad 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -13,6 +13,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -46,7 +47,8 @@ def load_agfs_config() -> AGFSConfig: AGFS_CONF = load_agfs_config() -AGFS_CONF.mode = "http-client" +if AGFS_CONF is not None: + AGFS_CONF.mode = "http-client" # 2. 
Skip tests if no S3 config found or backend is not S3 pytestmark = pytest.mark.skipif( @@ -81,11 +83,13 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/client/test_file_operations.py b/tests/client/test_file_operations.py index 99415f20..a402e4af 100644 --- a/tests/client/test_file_operations.py +++ b/tests/client/test_file_operations.py @@ -8,6 +8,7 @@ import pytest from openviking import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestRm: @@ -22,6 +23,7 @@ async def test_rm_file(self, client: AsyncOpenViking, sample_markdown_file: Path reason="Test rm", ) + await release_all_locks() uris = await client.tree(result["root_uri"]) for data in uris: if not data["isDir"]: @@ -35,7 +37,8 @@ async def test_rm_directory_recursive(self, client: AsyncOpenViking, sample_dire for f in sample_directory.glob("**/*.txt"): await client.add_resource(path=str(f), reason="Test rm dir") - # Get resource directory + # Release lifecycle locks held by add_resource before rm + await release_all_locks() entries = await client.ls("viking://resources/") for data in entries: if data["isDir"]: @@ -57,6 +60,7 @@ async def test_mv_file(self, client: AsyncOpenViking, sample_markdown_file: Path ) uri = result["root_uri"] new_uri = "viking://resources/moved/" + await release_all_locks() await client.mv(uri, new_uri) # Verify original location does not exist with pytest.raises(Exception): # noqa: B017 diff --git a/tests/client/test_import_export.py b/tests/client/test_import_export.py index e4dfe3a9..2aaac8f7 100644 --- a/tests/client/test_import_export.py +++ b/tests/client/test_import_export.py @@ -10,6 +10,7 @@ import pytest from openviking 
import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestExportOvpack: @@ -99,6 +100,7 @@ async def test_import_export_roundtrip( await client.export_ovpack(original_uri, str(export_path)) # Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # Import diff --git a/tests/client/test_resource_management.py b/tests/client/test_resource_management.py index 98302fd7..f42ce132 100644 --- a/tests/client/test_resource_management.py +++ b/tests/client/test_resource_management.py @@ -49,7 +49,7 @@ async def test_add_resource_with_to(self, client: AsyncOpenViking, sample_markdo """Test adding resource to specified target""" result = await client.add_resource( path=str(sample_markdown_file), - to="viking://resources/custom/", + to="viking://resources/custom/sample", reason="Test resource", ) diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 32421e69..3da7cc25 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -1,10 +1,8 @@ -import pytest -import asyncio -import os import json -import shutil -from pathlib import Path -from unittest.mock import MagicMock, AsyncMock, patch +import os +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest from openviking.async_client import AsyncOpenViking from openviking_cli.utils.config.open_viking_config import OpenVikingConfigSingleton @@ -88,6 +86,20 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): mock_agfs = MockLocalAGFS(root_path=tmp_path / "mock_agfs_root") + # Create mock parse result for Phase 1 (media processor) + mock_parse_result = MagicMock() + mock_parse_result.source_path = str(resource_file) + mock_parse_result.meta = {} + mock_parse_result.temp_dir_path = "/tmp/fake_temp_dir" + mock_parse_result.warnings = [] + mock_parse_result.source_format = "markdown" + + # Create mock context tree 
for Phase 2/3 (tree builder) + mock_context_tree = MagicMock() + mock_context_tree.root = MagicMock() + mock_context_tree.root.uri = "viking://resources/test_doc" + mock_context_tree.root.temp_uri = None + # Patch the Summarizer and IndexBuilder to verify calls with ( patch( @@ -96,6 +108,16 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), + patch( + "openviking.utils.media_processor.UnifiedResourceProcessor.process", + new_callable=AsyncMock, + return_value=mock_parse_result, + ), + patch( + "openviking.parse.tree_builder.TreeBuilder.finalize_from_temp", + new_callable=AsyncMock, + return_value=mock_context_tree, + ), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/integration/test_full_workflow.py b/tests/integration/test_full_workflow.py index 3f86b559..b48385d7 100644 --- a/tests/integration/test_full_workflow.py +++ b/tests/integration/test_full_workflow.py @@ -10,6 +10,7 @@ from openviking import AsyncOpenViking from openviking.message import TextPart +from openviking.storage.transaction import release_all_locks @pytest_asyncio.fixture(scope="function") @@ -67,11 +68,17 @@ async def test_add_search_read_workflow( # 3. 
Read searched resource if search_result.resources: - res = await client.tree(search_result.resources[0].uri) - for data in res: - if not data["isDir"]: - content = await client.read(data["uri"]) - assert len(content) > 0 + uri = search_result.resources[0].uri + info = await client.stat(uri) + if info.get("isDir"): + res = await client.tree(uri) + for data in res: + if not data["isDir"]: + content = await client.read(data["uri"]) + assert len(content) > 0 + else: + content = await client.read(uri) + assert len(content) > 0 class TestSessionWorkflow: @@ -165,6 +172,7 @@ async def test_export_import_roundtrip( assert export_path.exists() # 4. Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # 5. Import diff --git a/tests/misc/test_vikingdb_observer.py b/tests/misc/test_vikingdb_observer.py index 310d01b6..3dc3cfaf 100644 --- a/tests/misc/test_vikingdb_observer.py +++ b/tests/misc/test_vikingdb_observer.py @@ -8,13 +8,16 @@ import asyncio import openviking as ov +from openviking.async_client import AsyncOpenViking async def test_vikingdb_observer(): """Test VikingDBObserver functionality""" print("=== Test VikingDBObserver ===") - # Create client + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.AsyncOpenViking(path="./test_data/test_vikingdb_observer") try: @@ -72,15 +75,17 @@ async def test_vikingdb_observer(): traceback.print_exc() finally: - # Close client - await client.close() + await AsyncOpenViking.reset() print("Client closed") -def test_sync_client(): +async def test_sync_client(): """Test sync client""" print("\n=== Test sync client ===") + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.OpenViking(path="./test_data/test_vikingdb_observer") try: @@ -109,6 +114,7 @@ def test_sync_client(): finally: client.close() + await AsyncOpenViking.reset() print("Sync client closed") @@ -117,4 +123,4 @@ def 
test_sync_client(): asyncio.run(test_vikingdb_observer()) # Run sync test - test_sync_client() + asyncio.run(test_sync_client()) diff --git a/tests/retrieve/test_hierarchical_retriever_rerank.py b/tests/retrieve/test_hierarchical_retriever_rerank.py index ffaea6a8..a7ead7bc 100644 --- a/tests/retrieve/test_hierarchical_retriever_rerank.py +++ b/tests/retrieve/test_hierarchical_retriever_rerank.py @@ -19,7 +19,7 @@ def __init__(self) -> None: class DummyEmbedder: - def embed(self, _query: str) -> DummyEmbedResult: + def embed(self, _query: str, is_query: bool = False) -> DummyEmbedResult: return DummyEmbedResult() @@ -180,8 +180,8 @@ def test_merge_starting_points_prefers_rerank_scores_in_thinking_mode(monkeypatc "hello", ["viking://resources"], [ - {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2}, - {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8}, + {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2, "level": 1}, + {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8, "level": 1}, ], mode=RetrieverMode.THINKING, ) diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 627798b4..3bc0e40f 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -20,8 +20,10 @@ from openviking.server.config import ServerConfig from openviking.server.identity import RequestContext, Role from openviking.service.core import OpenVikingService +from openviking.storage.transaction import reset_lock_manager from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.embedding_config import EmbeddingConfig +from openviking_cli.utils.config.vlm_config import VLMConfig # --------------------------------------------------------------------------- # Paths @@ -54,11 +56,11 @@ class FakeEmbedder(DenseEmbedderBase): def __init__(self): super().__init__(model_name="test-fake-embedder") - def embed(self, text: str) -> EmbedResult: + def embed(self, 
text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1] * dimension) - def embed_batch(self, texts: list[str]) -> list[EmbedResult]: - return [self.embed(text) for text in texts] + def embed_batch(self, texts: list[str], is_query: bool = False) -> list[EmbedResult]: + return [self.embed(text, is_query=is_query) for text in texts] def get_dimension(self) -> int: return dimension @@ -67,6 +69,20 @@ def get_dimension(self) -> int: return FakeEmbedder +def _install_fake_vlm(monkeypatch): + """Use a fake VLM so server tests never hit external LLM APIs.""" + + async def _fake_get_completion(self, prompt, thinking=False, max_retries=0): + return "# Test Summary\n\nFake summary for testing.\n\n## Details\nTest content." + + async def _fake_get_vision_completion(self, prompt, images, thinking=False): + return "Fake image description for testing." + + monkeypatch.setattr(VLMConfig, "is_available", lambda self: True) + monkeypatch.setattr(VLMConfig, "get_completion_async", _fake_get_completion) + monkeypatch.setattr(VLMConfig, "get_vision_completion_async", _fake_get_vision_completion) + + # --------------------------------------------------------------------------- # Core fixtures: service + app + async client (HTTP API tests, in-process) # --------------------------------------------------------------------------- @@ -94,7 +110,9 @@ def sample_markdown_file(temp_dir: Path) -> Path: @pytest_asyncio.fixture(scope="function") async def service(temp_dir: Path, monkeypatch): """Create and initialize an OpenVikingService in embedded mode.""" + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "data"), user=UserIdentifier.the_default_user("test_user") ) @@ -102,6 +120,7 @@ async def service(temp_dir: Path, monkeypatch): svc.viking_fs.query_embedder = fake_embedder_cls() yield svc await svc.close() + reset_lock_manager() 
@pytest_asyncio.fixture(scope="function") @@ -146,7 +165,9 @@ async def client_with_resource(client, service, sample_markdown_file): async def running_server(temp_dir: Path, monkeypatch): """Start a real uvicorn server in a background thread.""" await AsyncOpenViking.reset() + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "sdk_data"), user=UserIdentifier.the_default_user("sdk_test_user") diff --git a/tests/server/test_api_filesystem.py b/tests/server/test_api_filesystem.py index 79058d37..3a0da611 100644 --- a/tests/server/test_api_filesystem.py +++ b/tests/server/test_api_filesystem.py @@ -66,14 +66,6 @@ async def test_tree(client: httpx.AsyncClient): assert body["status"] == "ok" -async def test_stat_after_add_resource(client_with_resource): - client, uri = client_with_resource - resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) - assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - - async def test_stat_not_found(client: httpx.AsyncClient): resp = await client.get( "/api/v1/fs/stat", @@ -84,18 +76,28 @@ async def test_stat_not_found(client: httpx.AsyncClient): assert body["status"] == "error" -async def test_rm_resource(client_with_resource): +async def test_resource_ops(client_with_resource): + """Test stat, ls_recursive, mv, rm on a single shared resource.""" + import uuid + client, uri = client_with_resource - resp = await client.request("DELETE", "/api/v1/fs", params={"uri": uri, "recursive": True}) + + # stat + resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) assert resp.status_code == 200 assert resp.json()["status"] == "ok" + # ls recursive + resp = await client.get( + "/api/v1/fs/ls", + params={"uri": "viking://", "recursive": True}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "ok" + assert isinstance(body["result"], list) -async def 
test_mv_resource(client_with_resource): - import uuid - - client, uri = client_with_resource - # Use a unique name to avoid conflicts with leftover data + # mv unique = uuid.uuid4().hex[:8] new_uri = uri.rstrip("/") + f"_mv_{unique}/" resp = await client.post( @@ -105,14 +107,7 @@ async def test_mv_resource(client_with_resource): assert resp.status_code == 200 assert resp.json()["status"] == "ok" - -async def test_ls_recursive(client_with_resource): - client, _ = client_with_resource - resp = await client.get( - "/api/v1/fs/ls", - params={"uri": "viking://", "recursive": True}, - ) + # rm (on the moved uri) + resp = await client.request("DELETE", "/api/v1/fs", params={"uri": new_uri, "recursive": True}) assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - assert isinstance(body["result"], list) + assert resp.json()["status"] == "ok" diff --git a/tests/server/test_api_resources.py b/tests/server/test_api_resources.py index 48e214e8..6e591127 100644 --- a/tests/server/test_api_resources.py +++ b/tests/server/test_api_resources.py @@ -120,7 +120,7 @@ async def test_add_resource_with_to(client: httpx.AsyncClient, sample_markdown_f "/api/v1/resources", json={ "path": str(sample_markdown_file), - "to": "viking://resources/custom/", + "to": "viking://resources/custom/sample", "reason": "test resource", }, ) diff --git a/tests/server/test_api_search.py b/tests/server/test_api_search.py index 05d313fb..ce33773c 100644 --- a/tests/server/test_api_search.py +++ b/tests/server/test_api_search.py @@ -12,7 +12,7 @@ @pytest.fixture(autouse=True) def fake_query_embedder(service): class FakeEmbedder: - def embed(self, text: str) -> EmbedResult: + def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1, 0.2, 0.3]) service.viking_fs.query_embedder = FakeEmbedder() diff --git a/tests/session/test_memory_dedup_actions.py b/tests/session/test_memory_dedup_actions.py index 844f9294..ac273965 100644 --- 
a/tests/session/test_memory_dedup_actions.py +++ b/tests/session/test_memory_dedup_actions.py @@ -42,7 +42,7 @@ def __init__(self, dense_vector): class _DummyEmbedder: - def embed(self, _text): + def embed(self, _text, is_query: bool = False): return _DummyEmbedResult([0.1, 0.2, 0.3]) @@ -78,7 +78,7 @@ def _make_compressor(vikingdb=None, embedder=None) -> SessionCompressor: """Create SessionCompressor without config dependency.""" vikingdb = vikingdb or MagicMock() with patch("openviking.session.memory_deduplicator.get_openviking_config") as mock_config: - mock_config.return_value.embedding.get_query_embedder.return_value = embedder + mock_config.return_value.embedding.get_embedder.return_value = embedder compressor = SessionCompressor(vikingdb=vikingdb) return compressor @@ -777,7 +777,7 @@ class _NoVLMConfig: class embedding: @staticmethod - def get_query_embedder(): + def get_embedder(): return _DummyEmbedder() fs = MagicMock() diff --git a/tests/session/test_session_commit.py b/tests/session/test_session_commit.py index 60a42d02..efa57fc7 100644 --- a/tests/session/test_session_commit.py +++ b/tests/session/test_session_commit.py @@ -6,9 +6,6 @@ from openviking import AsyncOpenViking from openviking.message import TextPart from openviking.session import Session -from tests.utils.mock_context import make_test_ctx - -ctx = make_test_ctx() class TestCommit: @@ -98,12 +95,14 @@ async def test_active_count_incremented_after_commit(self, client_with_resource_ """ client, uri = client_with_resource_sync vikingdb = client._client.service.vikingdb_manager + # Use the client's own context to match the account_id used when adding the resource + client_ctx = client._client._ctx # Look up the record by URI records_before = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_before, f"Resource not found for URI: {uri}" count_before = records_before[0].get("active_count") or 0 @@ -121,7 +120,7 @@ async def 
test_active_count_incremented_after_commit(self, client_with_resource_ records_after = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_after, f"Record disappeared after commit for URI: {uri}" count_after = records_after[0].get("active_count") or 0 diff --git a/tests/session/test_session_compressor_vikingdb.py b/tests/session/test_session_compressor_vikingdb.py index 71e22533..7cb38d02 100644 --- a/tests/session/test_session_compressor_vikingdb.py +++ b/tests/session/test_session_compressor_vikingdb.py @@ -15,8 +15,12 @@ async def test_delete_existing_memory_uses_vikingdb_manager(): compressor = SessionCompressor.__new__(SessionCompressor) compressor.vikingdb = AsyncMock() + compressor._pending_semantic_changes = {} viking_fs = AsyncMock() - memory = SimpleNamespace(uri="viking://user/user1/memories/events/e1") + memory = SimpleNamespace( + uri="viking://user/user1/memories/events/e1", + parent_uri="viking://user/user1/memories/events", + ) ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) ok = await SessionCompressor._delete_existing_memory(compressor, memory, viking_fs, ctx) diff --git a/tests/storage/test_semantic_dag_skip_files.py b/tests/storage/test_semantic_dag_skip_files.py index 6b52fa4e..3eaeaa2f 100644 --- a/tests/storage/test_semantic_dag_skip_files.py +++ b/tests/storage/test_semantic_dag_skip_files.py @@ -1,6 +1,8 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import AsyncMock, MagicMock + import pytest from openviking.server.identity import RequestContext, Role @@ -8,6 +10,23 @@ from openviking_cli.session.user_id import UserIdentifier +def _mock_transaction_layer(monkeypatch): + """Patch lock layer to no-op for DAG tests.""" + mock_handle = MagicMock() + monkeypatch.setattr( + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), + ) + monkeypatch.setattr( + "openviking.storage.transaction.lock_context.LockContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.get_lock_manager", + lambda: MagicMock(), + ) + + class _FakeVikingFS: def __init__(self, tree): self._tree = tree @@ -19,6 +38,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -35,6 +57,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return "abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): @@ -57,6 +82,7 @@ async def register(self, **_kwargs): @pytest.mark.asyncio async def test_messages_jsonl_excluded_from_summary(monkeypatch): """messages.jsonl should be skipped by _list_dir and never summarized.""" + _mock_transaction_layer(monkeypatch) root_uri = "viking://session/test-session" tree = { root_uri: [ @@ -91,6 +117,7 @@ async def test_messages_jsonl_excluded_from_summary(monkeypatch): @pytest.mark.asyncio async def test_messages_jsonl_excluded_in_subdirectory(monkeypatch): """messages.jsonl in a subdirectory should also be skipped.""" + 
_mock_transaction_layer(monkeypatch) root_uri = "viking://session/test-session" tree = { root_uri: [ diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 8cb8c1f8..94f9441f 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +from unittest.mock import AsyncMock, MagicMock import pytest @@ -21,6 +22,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -36,6 +40,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return "abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): @@ -75,6 +82,21 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): lambda: _DummyTracker(), ) + # Mock lock layer: LockContext as no-op passthrough + mock_handle = MagicMock() + monkeypatch.setattr( + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), + ) + monkeypatch.setattr( + "openviking.storage.transaction.lock_context.LockContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.get_lock_manager", + lambda: MagicMock(), + ) + processor = _FakeProcessor() ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) executor = SemanticDagExecutor( diff --git a/tests/test_session_task_tracking.py b/tests/test_session_task_tracking.py index 8a61fe4d..1306d500 100644 --- a/tests/test_session_task_tracking.py +++ 
b/tests/test_session_task_tracking.py @@ -181,7 +181,7 @@ async def test_task_failed_when_memory_extraction_raises(api_client): async def failing_extract(_context, _user, _session_id): raise RuntimeError("memory_extraction_failed: synthetic extractor error") - service.sessions._session_compressor.extractor.extract_strict = failing_extract + service.sessions._session_compressor.extractor.extract = failing_extract resp = await client.post(f"/api/v1/sessions/{session_id}/commit", params={"wait": False}) task_id = resp.json()["result"]["task_id"] diff --git a/tests/test_telemetry_runtime.py b/tests/test_telemetry_runtime.py index eecdbf04..d3700706 100644 --- a/tests/test_telemetry_runtime.py +++ b/tests/test_telemetry_runtime.py @@ -12,6 +12,7 @@ from openviking.server.identity import RequestContext, Role from openviking.service.resource_service import ResourceService from openviking.storage.collection_schemas import TextEmbeddingHandler +from openviking.storage.queuefs.semantic_dag import DagStats from openviking.storage.queuefs.semantic_msg import SemanticMsg from openviking.storage.queuefs.semantic_processor import SemanticProcessor from openviking.telemetry import ( @@ -134,15 +135,25 @@ class FakeVikingFS: async def ls(self, uri, ctx=None): return [] - async def fake_process_single_directory(**kwargs): - assert get_current_telemetry() is telemetry - get_current_telemetry().record_token_usage("llm", 11, 7) + class _FakeDagExecutor: + def __init__(self, **kwargs): + pass + + async def run(self, root_uri): + assert get_current_telemetry() is telemetry + get_current_telemetry().record_token_usage("llm", 11, 7) + + def get_stats(self): + return DagStats() monkeypatch.setattr( "openviking.storage.queuefs.semantic_processor.get_viking_fs", lambda: FakeVikingFS(), ) - monkeypatch.setattr(processor, "_process_single_directory", fake_process_single_directory) + monkeypatch.setattr( + "openviking.storage.queuefs.semantic_processor.SemanticDagExecutor", + lambda **kwargs: 
_FakeDagExecutor(**kwargs), + ) try: await processor.on_dequeue( @@ -185,7 +196,7 @@ def __init__(self): class _DummyVikingDB: is_closing = False - async def upsert(self, _data): + async def upsert(self, _data, *, ctx=None): return "rec-1" monkeypatch.setattr( diff --git a/tests/transaction/__init__.py b/tests/transaction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py new file mode 100644 index 00000000..a0952289 --- /dev/null +++ b/tests/transaction/conftest.py @@ -0,0 +1,139 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Shared fixtures for transaction tests using real AGFS and VectorDB backends.""" + +import os +import shutil +import uuid + +import pytest + +from openviking.agfs_manager import AGFSManager +from openviking.server.identity import RequestContext, Role +from openviking.storage.collection_schemas import CollectionSchemas +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _make_fencing_token +from openviking.storage.transaction.redo_log import RedoLog +from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend +from openviking.utils.agfs_utils import create_agfs_client +from openviking_cli.session.user_id import UserIdentifier +from openviking_cli.utils.config.agfs_config import AGFSConfig +from openviking_cli.utils.config.vectordb_config import VectorDBBackendConfig + +AGFS_CONF = AGFSConfig( + path="/tmp/ov-tx-test", backend="local", port=1834, url="http://localhost:1834", timeout=10 +) + +VECTOR_DIM = 4 +COLLECTION_NAME = "tx_test_ctx" + +# Clean slate before session starts +if os.path.exists(AGFS_CONF.path): + shutil.rmtree(AGFS_CONF.path) + + +@pytest.fixture(scope="session") +def agfs_manager(): + manager = AGFSManager(config=AGFS_CONF) + manager.start() + yield manager + manager.stop() 
+ + +@pytest.fixture(scope="session") +def agfs_client(agfs_manager): + return create_agfs_client(AGFS_CONF) + + +def _mkdir_ok(agfs_client, path): + """Create directory, ignoring already-exists errors.""" + try: + agfs_client.mkdir(path) + except Exception: + pass # already exists + + +@pytest.fixture +def test_dir(agfs_client): + path = f"/local/tx-tests/{uuid.uuid4().hex}" + _mkdir_ok(agfs_client, "/local") + _mkdir_ok(agfs_client, "/local/tx-tests") + _mkdir_ok(agfs_client, path) + yield path + try: + agfs_client.rm(path, recursive=True) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# VectorDB fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def vector_store(tmp_path_factory): + """Session-scoped real local VectorDB backend.""" + db_path = str(tmp_path_factory.mktemp("vectordb")) + config = VectorDBBackendConfig( + backend="local", + name=COLLECTION_NAME, + path=db_path, + dimension=VECTOR_DIM, + ) + store = VikingVectorIndexBackend(config=config) + + import asyncio + + schema = CollectionSchemas.context_collection(COLLECTION_NAME, VECTOR_DIM) + asyncio.get_event_loop().run_until_complete(store.create_collection(COLLECTION_NAME, schema)) + + yield store + + asyncio.get_event_loop().run_until_complete(store.close()) + + +@pytest.fixture(scope="session") +def request_ctx(): + """Session-scoped RequestContext for VectorDB operations.""" + user = UserIdentifier("default", "test_user", "default") + return RequestContext(user=user, role=Role.ROOT) + + +# --------------------------------------------------------------------------- +# Lock fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def lock_manager(agfs_client): + """Function-scoped LockManager with real AGFS backend.""" + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +@pytest.fixture 
+def redo_log(agfs_client): + """Function-scoped RedoLog with real AGFS backend.""" + return RedoLog(agfs_client) + + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + + +def file_exists(agfs_client, path) -> bool: + """Check if a file/dir exists in AGFS.""" + try: + agfs_client.stat(path) + return True + except Exception: + return False + + +def make_lock_file(agfs_client, dir_path, tx_id, lock_type="P") -> str: + """Create a real lock file in AGFS and return its path.""" + lock_path = f"{dir_path.rstrip('/')}/{LOCK_FILE_NAME}" + token = _make_fencing_token(tx_id, lock_type) + agfs_client.write(lock_path, token.encode("utf-8")) + return lock_path diff --git a/tests/transaction/test_concurrent_lock.py b/tests/transaction/test_concurrent_lock.py new file mode 100644 index 00000000..7e25ab57 --- /dev/null +++ b/tests/transaction/test_concurrent_lock.py @@ -0,0 +1,103 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for concurrent lock acquisition using real AGFS backend.""" + +import asyncio +import uuid + +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.path_lock import PathLock + + +class TestConcurrentLock: + async def test_point_mutual_exclusion_same_path(self, agfs_client, test_dir): + """两个任务竞争同一路径的 POINT 锁,均最终成功(串行执行)。""" + lock = PathLock(agfs_client) + + results = {} + + async def holder(tx_id): + tx = LockHandle(id=tx_id) + ok = await lock.acquire_point(test_dir, tx, timeout=5.0) + if ok: + await asyncio.sleep(0.3) + await lock.release(tx) + results[tx_id] = ok + + await asyncio.gather( + holder("tx-conc-1"), + holder("tx-conc-2"), + ) + + # Both should eventually succeed (one waits for the other) + assert results["tx-conc-1"] is True + assert results["tx-conc-2"] is True + + async def test_subtree_blocks_concurrent_point_child(self, agfs_client, test_dir): + """SUBTREE on parent 持锁期间,子目录的 POINT 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + parent_acquired = asyncio.Event() + parent_released = asyncio.Event() + + child_result = {} + + async def parent_holder(): + tx = LockHandle(id="tx-sub-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + assert ok is True + parent_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + parent_released.set() + + async def child_worker(): + await parent_acquired.wait() + tx = LockHandle(id="tx-sub-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + child_result["ok"] = ok + child_result["after_release"] = parent_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(parent_holder(), child_worker()) + + assert child_result["ok"] is True + # Child should succeed only after parent released + assert child_result["after_release"] is True + + async def 
test_point_child_blocks_concurrent_subtree_parent(self, agfs_client, test_dir): + """POINT on child 持锁期间,父目录的 SUBTREE 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + child_acquired = asyncio.Event() + child_released = asyncio.Event() + + parent_result = {} + + async def child_holder(): + tx = LockHandle(id="tx-rev-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + assert ok is True + child_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + child_released.set() + + async def parent_worker(): + await child_acquired.wait() + tx = LockHandle(id="tx-rev-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + parent_result["ok"] = ok + parent_result["after_release"] = child_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(child_holder(), parent_worker()) + + assert parent_result["ok"] is True + assert parent_result["after_release"] is True diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py new file mode 100644 index 00000000..2f284f53 --- /dev/null +++ b/tests/transaction/test_e2e.py @@ -0,0 +1,125 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""End-to-end lock tests using real AGFS backend. + +These tests exercise LockContext -> LockManager -> PathLock -> AGFS, +verifying the acquire -> operate -> release lifecycle. 
+""" + +import uuid + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + """Return True if the lock file does not exist in AGFS.""" + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lock_manager(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockContextCommit: + async def test_lock_acquired_and_released(self, agfs_client, lock_manager, test_dir): + """Lock is held inside the context and released after exit.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_file_persists_after_context(self, agfs_client, lock_manager, test_dir): + """Files written inside a lock context persist.""" + file_path = f"{test_dir}/committed-file.txt" + + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + agfs_client.write(file_path, b"committed data") + + content = agfs_client.cat(file_path) + assert content == b"committed data" + + +class TestLockContextException: + async def test_lock_released_on_exception(self, agfs_client, lock_manager, test_dir): + """Lock is released even when an exception occurs inside the context.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + with pytest.raises(RuntimeError): + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None + raise RuntimeError("simulated failure") + + assert _lock_file_gone(agfs_client, lock_path) + + async def 
test_exception_not_swallowed(self, agfs_client, lock_manager, test_dir): + """Exceptions propagate through the context manager.""" + with pytest.raises(ValueError, match="test error"): + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + raise ValueError("test error") + + +class TestLockContextMv: + async def test_mv_lock_acquires_both_paths(self, agfs_client, lock_manager, test_dir): + """mv lock mode acquires SUBTREE on both source and destination.""" + src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): + src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token + dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token + assert ":S" in src_token_str + assert ":S" in dst_token_str + + for path in [f"{src}/{LOCK_FILE_NAME}", f"{dst}/{LOCK_FILE_NAME}"]: + assert _lock_file_gone(agfs_client, path) + + +class TestLockContextSubtree: + async def test_subtree_lock_and_release(self, agfs_client, lock_manager, test_dir): + """Subtree lock is acquired and released.""" + target = f"{test_dir}/sub-{uuid.uuid4().hex}" + agfs_client.mkdir(target) + + async with LockContext(lock_manager, [target], lock_mode="subtree"): + token = agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + assert _lock_file_gone(agfs_client, f"{target}/{LOCK_FILE_NAME}") + + +class TestSequentialLocks: + async def test_sequential_locks_on_same_path(self, agfs_client, lock_manager, test_dir): + """Multiple sequential lock contexts on the same path succeed.""" + for i in range(3): + async with LockContext(lock_manager, [test_dir], 
lock_mode="point"): + agfs_client.write(f"{test_dir}/f{i}.txt", f"data-{i}".encode()) + + for i in range(3): + content = agfs_client.cat(f"{test_dir}/f{i}.txt") + assert content == f"data-{i}".encode() + + async def test_lock_acquisition_failure(self, agfs_client, lock_manager, test_dir): + """LockContext raises LockAcquisitionError for nonexistent path.""" + nonexistent = f"{test_dir}/nonexistent-{uuid.uuid4().hex}" + with pytest.raises(LockAcquisitionError): + async with LockContext(lock_manager, [nonexistent], lock_mode="point"): + pass diff --git a/tests/transaction/test_lock_context.py b/tests/transaction/test_lock_context.py new file mode 100644 index 00000000..131fb48e --- /dev/null +++ b/tests/transaction/test_lock_context.py @@ -0,0 +1,85 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockContext async context manager.""" + +import uuid + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockContextPoint: + async def test_point_lock_lifecycle(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lm, [test_dir], lock_mode="point") as handle: + assert handle is not None + token = agfs_client.cat(lock_path) + assert token is not None + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_lock_released_on_exception(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + with 
pytest.raises(RuntimeError): + async with LockContext(lm, [test_dir], lock_mode="point"): + assert agfs_client.cat(lock_path) is not None + raise RuntimeError("fail") + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_exception_propagates(self, lm, test_dir): + with pytest.raises(ValueError, match="test"): + async with LockContext(lm, [test_dir], lock_mode="point"): + raise ValueError("test") + + +class TestLockContextSubtree: + async def test_subtree_lock(self, agfs_client, lm, test_dir): + async with LockContext(lm, [test_dir], lock_mode="subtree"): + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + +class TestLockContextMv: + async def test_mv_lock(self, agfs_client, lm, test_dir): + src = f"{test_dir}/src-{uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with LockContext(lm, [src], lock_mode="mv", mv_dst_parent_path=dst) as handle: + assert len(handle.locks) == 2 + + +class TestLockContextFailure: + async def test_nonexistent_path_raises(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + async def test_handle_cleaned_up_on_failure(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + assert len(lm.get_active_handles()) == 0 diff --git a/tests/transaction/test_lock_manager.py b/tests/transaction/test_lock_manager.py new file mode 100644 index 00000000..e30f724b --- /dev/null +++ b/tests/transaction/test_lock_manager.py @@ -0,0 +1,88 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockManager.""" + +import uuid + +import pytest + +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockManagerBasic: + async def test_create_handle_and_acquire_point(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, test_dir) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + assert content is not None + + await lm.release(handle) + assert _lock_file_gone(agfs_client, lock_path) + + async def test_acquire_subtree(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_subtree(handle, test_dir) + assert ok is True + + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + await lm.release(handle) + + async def test_acquire_mv(self, agfs_client, lm, test_dir): + src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + handle = lm.create_handle() + ok = await lm.acquire_mv(handle, src, dst) + assert ok is True + assert len(handle.locks) == 2 + + await lm.release(handle) + assert handle.id not in lm.get_active_handles() + + async def test_release_removes_from_active(self, lm, test_dir): + handle = lm.create_handle() + assert handle.id in lm.get_active_handles() + + await lm.acquire_point(handle, test_dir) + await lm.release(handle) + + assert handle.id not in lm.get_active_handles() + + async def 
test_stop_releases_all(self, agfs_client, lm, test_dir): + h1 = lm.create_handle() + h2 = lm.create_handle() + await lm.acquire_point(h1, test_dir) + + sub = f"{test_dir}/sub-{uuid.uuid4().hex}" + agfs_client.mkdir(sub) + await lm.acquire_point(h2, sub) + + await lm.stop() + assert len(lm.get_active_handles()) == 0 + + async def test_nonexistent_path_fails(self, lm): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, "/local/nonexistent-xyz") + assert ok is False diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py new file mode 100644 index 00000000..0b721e07 --- /dev/null +++ b/tests/transaction/test_path_lock.py @@ -0,0 +1,336 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for path lock with fencing tokens.""" + +import time +from unittest.mock import MagicMock + +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.path_lock import ( + LOCK_FILE_NAME, + LOCK_TYPE_POINT, + LOCK_TYPE_SUBTREE, + PathLock, + _make_fencing_token, + _parse_fencing_token, +) + + +class TestFencingToken: + def test_make_parse_roundtrip(self): + token = _make_fencing_token("tx-123") + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-123" + assert ts > 0 + assert lock_type == LOCK_TYPE_POINT + + def test_make_parse_subtree_roundtrip(self): + token = _make_fencing_token("tx-456", LOCK_TYPE_SUBTREE) + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-456" + assert ts > 0 + assert lock_type == LOCK_TYPE_SUBTREE + + def test_parse_legacy_format_two_part(self): + """Legacy two-part token "{tx_id}:{ts}" defaults to POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-old:1234567890") + assert tx_id == "tx-old" + assert ts == 1234567890 + assert lock_type == LOCK_TYPE_POINT + + def test_parse_legacy_format_plain(self): + """Plain tx_id (no colon) defaults to ts=0, 
lock_type=POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-bare") + assert tx_id == "tx-bare" + assert ts == 0 + assert lock_type == LOCK_TYPE_POINT + + def test_tokens_are_unique(self): + t1 = _make_fencing_token("tx-1") + time.sleep(0.001) + t2 = _make_fencing_token("tx-1") + assert t1 != t2 + + +class TestPathLockStale: + def test_is_lock_stale_no_file(self): + agfs = MagicMock() + agfs.read.side_effect = Exception("not found") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_legacy_token(self): + agfs = MagicMock() + agfs.read.return_value = b"tx-old-format" + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_recent_token(self): + agfs = MagicMock() + token = _make_fencing_token("tx-1") + agfs.read.return_value = token.encode("utf-8") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock", expire_seconds=300.0) is False + + +class TestPathLockBehavior: + """Behavioral tests using real AGFS backend.""" + + async def test_acquire_point_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = LockHandle(id="tx-point-1") + + ok = await lock.acquire_point(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":P" in token + assert "tx-point-1" in token + + await lock.release(tx) + + async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = LockHandle(id="tx-subtree-1") + + ok = await lock.acquire_subtree(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":S" in token + assert "tx-subtree-1" in token + + await 
lock.release(tx) + + async def test_acquire_point_dir_not_found(self, agfs_client): + lock = PathLock(agfs_client) + tx = LockHandle(id="tx-no-dir") + + ok = await lock.acquire_point("/local/nonexistent-path-xyz", tx, timeout=0.5) + assert ok is False + assert len(tx.locks) == 0 + + async def test_release_removes_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = LockHandle(id="tx-release-1") + + await lock.acquire_point(test_dir, tx, timeout=3.0) + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + await lock.release(tx) + + # Lock file should be gone (use stat, not cat — cat returns b'' for deleted files) + try: + agfs_client.stat(lock_path) + raise AssertionError("Lock file should have been removed") + except AssertionError: + raise + except Exception: + pass # Expected: file not found + + async def test_sequential_acquire_works(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + + tx1 = LockHandle(id="tx-seq-1") + ok1 = await lock.acquire_point(test_dir, tx1, timeout=3.0) + assert ok1 is True + + await lock.release(tx1) + + tx2 = LockHandle(id="tx-seq-2") + ok2 = await lock.acquire_point(test_dir, tx2, timeout=3.0) + assert ok2 is True + + await lock.release(tx2) + + async def test_point_blocked_by_ancestor_subtree(self, agfs_client, test_dir): + """POINT on child blocked while ancestor holds SUBTREE lock.""" + import uuid as _uuid + + child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_parent = LockHandle(id="tx-parent-subtree") + ok = await lock.acquire_subtree(test_dir, tx_parent, timeout=3.0) + assert ok is True + + tx_child = LockHandle(id="tx-child-point") + blocked = await lock.acquire_point(child, tx_child, timeout=0.5) + assert blocked is False + + await lock.release(tx_parent) + + async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): + """SUBTREE on parent blocked while descendant holds POINT lock.""" + import uuid as _uuid + + 
child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_child = LockHandle(id="tx-desc-point") + ok = await lock.acquire_point(child, tx_child, timeout=3.0) + assert ok is True + + tx_parent = LockHandle(id="tx-parent-sub") + blocked = await lock.acquire_subtree(test_dir, tx_parent, timeout=0.5) + assert blocked is False + + await lock.release(tx_child) + + async def test_acquire_mv_creates_subtree_locks(self, agfs_client, test_dir): + """acquire_mv puts SUBTREE on both src and dst.""" + import uuid as _uuid + + src = f"{test_dir}/src-{_uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{_uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + lock = PathLock(agfs_client) + tx = LockHandle(id="tx-mv-1") + ok = await lock.acquire_mv(src, dst, tx, timeout=3.0) + assert ok is True + + src_token_bytes = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + src_token = ( + src_token_bytes.decode("utf-8") + if isinstance(src_token_bytes, bytes) + else src_token_bytes + ) + assert ":S" in src_token + + dst_token_bytes = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + dst_token = ( + dst_token_bytes.decode("utf-8") + if isinstance(dst_token_bytes, bytes) + else dst_token_bytes + ) + assert ":S" in dst_token + + await lock.release(tx) + + async def test_point_does_not_block_sibling_point(self, agfs_client, test_dir): + """POINT locks on different directories do not conflict.""" + import uuid as _uuid + + dir_a = f"{test_dir}/sibling-a-{_uuid.uuid4().hex}" + dir_b = f"{test_dir}/sibling-b-{_uuid.uuid4().hex}" + agfs_client.mkdir(dir_a) + agfs_client.mkdir(dir_b) + + lock = PathLock(agfs_client) + tx_a = LockHandle(id="tx-sib-a") + tx_b = LockHandle(id="tx-sib-b") + + ok_a = await lock.acquire_point(dir_a, tx_a, timeout=3.0) + ok_b = await lock.acquire_point(dir_b, tx_b, timeout=3.0) + + assert ok_a is True + assert ok_b is True + + await lock.release(tx_a) + await lock.release(tx_b) + + async def 
test_stale_lock_auto_removed_on_acquire(self, agfs_client, test_dir): + """A stale lock (expired fencing token) is auto-removed, allowing a new acquire.""" + import uuid as _uuid + + target = f"{test_dir}/stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock_path = f"{target}/{LOCK_FILE_NAME}" + + # Write a lock file with a very old timestamp (simulate crashed process) + old_ts = time.time_ns() - int(600 * 1e9) # 600 seconds ago + stale_token = f"tx-dead:{old_ts}:{LOCK_TYPE_POINT}" + agfs_client.write(lock_path, stale_token.encode("utf-8")) + + # New transaction should succeed by auto-removing the stale lock + lock = PathLock(agfs_client, lock_expire=300.0) + tx = LockHandle(id="tx-new-owner") + ok = await lock.acquire_point(target, tx, timeout=2.0) + assert ok is True + + # Verify new lock is owned by our transaction + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert "tx-new-owner" in token + + await lock.release(tx) + + async def test_stale_subtree_ancestor_auto_removed(self, agfs_client, test_dir): + """A stale SUBTREE lock on ancestor is auto-removed when child acquires POINT.""" + import uuid as _uuid + + child = f"{test_dir}/child-stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + # Write stale SUBTREE lock on parent + parent_lock = f"{test_dir}/{LOCK_FILE_NAME}" + old_ts = time.time_ns() - int(600 * 1e9) + stale_token = f"tx-dead-parent:{old_ts}:{LOCK_TYPE_SUBTREE}" + agfs_client.write(parent_lock, stale_token.encode("utf-8")) + + lock = PathLock(agfs_client, lock_expire=300.0) + tx = LockHandle(id="tx-child-new") + ok = await lock.acquire_point(child, tx, timeout=2.0) + assert ok is True + + await lock.release(tx) + # Clean up stale parent lock if still present + try: + agfs_client.rm(parent_lock) + except Exception: + pass + + async def test_point_same_path_no_wait_fails_immediately(self, agfs_client, test_dir): + """With timeout=0, a conflicting lock fails 
immediately.""" + import uuid as _uuid + + target = f"{test_dir}/nowait-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = LockHandle(id="tx-hold") + ok1 = await lock.acquire_point(target, tx1, timeout=3.0) + assert ok1 is True + + # Second acquire with timeout=0 should fail immediately + tx2 = LockHandle(id="tx-blocked") + t0 = time.monotonic() + ok2 = await lock.acquire_point(target, tx2, timeout=0.0) + elapsed = time.monotonic() - t0 + + assert ok2 is False + assert elapsed < 1.0 # Should not wait + + await lock.release(tx1) + + async def test_subtree_same_path_mutual_exclusion(self, agfs_client, test_dir): + """Two SUBTREE locks on the same path: second one blocked until first releases.""" + import uuid as _uuid + + target = f"{test_dir}/sub-excl-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = LockHandle(id="tx-sub1") + ok1 = await lock.acquire_subtree(target, tx1, timeout=3.0) + assert ok1 is True + + tx2 = LockHandle(id="tx-sub2") + ok2 = await lock.acquire_subtree(target, tx2, timeout=0.5) + assert ok2 is False + + await lock.release(tx1) + + # Now tx2 should succeed + ok2_retry = await lock.acquire_subtree(target, tx2, timeout=3.0) + assert ok2_retry is True + await lock.release(tx2) diff --git a/tests/transaction/test_redo_log.py b/tests/transaction/test_redo_log.py new file mode 100644 index 00000000..8a0def2c --- /dev/null +++ b/tests/transaction/test_redo_log.py @@ -0,0 +1,78 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for RedoLog crash recovery.""" + +import uuid + +import pytest + +from openviking.storage.transaction.redo_log import RedoLog + + +@pytest.fixture +def redo(agfs_client): + return RedoLog(agfs_client) + + +class TestRedoLogBasic: + def test_write_and_read(self, redo): + task_id = uuid.uuid4().hex + info = {"archive_uri": "viking://test/archive", "session_uri": "viking://test/session"} + redo.write_pending(task_id, info) + + result = redo.read(task_id) + assert result["archive_uri"] == "viking://test/archive" + assert result["session_uri"] == "viking://test/session" + + redo.mark_done(task_id) + + def test_list_pending(self, redo): + t1 = uuid.uuid4().hex + t2 = uuid.uuid4().hex + redo.write_pending(t1, {"key": "v1"}) + redo.write_pending(t2, {"key": "v2"}) + + pending = redo.list_pending() + assert t1 in pending + assert t2 in pending + + redo.mark_done(t1) + pending_after = redo.list_pending() + assert t1 not in pending_after + assert t2 in pending_after + + redo.mark_done(t2) + + def test_mark_done_removes_task(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 1}) + redo.mark_done(task_id) + + pending = redo.list_pending() + assert task_id not in pending + + def test_read_nonexistent_returns_empty(self, redo): + result = redo.read("nonexistent-task-id") + assert result == {} + + def test_list_pending_empty(self, redo): + # Should not crash even if _REDO_ROOT doesn't exist yet + pending = redo.list_pending() + assert isinstance(pending, list) + + def test_mark_done_idempotent(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 1}) + redo.mark_done(task_id) + # Second mark_done should not raise + redo.mark_done(task_id) + + def test_overwrite_pending(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"version": 1}) + redo.write_pending(task_id, {"version": 2}) + + result = redo.read(task_id) + assert result["version"] == 2 + + 
redo.mark_done(task_id) diff --git a/tests/unit/test_openai_embedder.py b/tests/unit/test_openai_embedder.py index 2f66567e..02df2a23 100644 --- a/tests/unit/test_openai_embedder.py +++ b/tests/unit/test_openai_embedder.py @@ -233,3 +233,67 @@ def test_telemetry_skipped_when_no_usage(self, mock_openai_class): ) result = embedder.embed("Hello world") assert result.dense_vector is not None + + @patch("openviking.models.embedder.openai_embedders.openai.OpenAI") + def test_telemetry_skipped_when_module_missing(self, mock_openai_class): + """_update_telemetry_token_usage should silently no-op when telemetry module is not available""" + mock_client = MagicMock() + mock_openai_class.return_value = mock_client + + mock_embedding = MagicMock() + mock_embedding.embedding = [0.1] * 1536 + + mock_usage = MagicMock() + mock_usage.prompt_tokens = 10 + mock_usage.total_tokens = 10 + + mock_response = MagicMock() + mock_response.data = [mock_embedding] + mock_response.usage = mock_usage + mock_client.embeddings.create.return_value = mock_response + + embedder = OpenAIDenseEmbedder( + model_name="text-embedding-3-small", + api_key="test-api-key", + dimension=1536, + ) + + with patch("importlib.import_module", side_effect=ImportError("no telemetry")): + result = embedder.embed("Hello world") + + assert result.dense_vector is not None + + @patch("openviking.models.embedder.openai_embedders.openai.OpenAI") + def test_telemetry_called_when_module_available(self, mock_openai_class): + """_update_telemetry_token_usage should call telemetry when module is available""" + mock_client = MagicMock() + mock_openai_class.return_value = mock_client + + mock_embedding = MagicMock() + mock_embedding.embedding = [0.1] * 1536 + + mock_usage = MagicMock() + mock_usage.prompt_tokens = 8 + mock_usage.total_tokens = 8 + + mock_response = MagicMock() + mock_response.data = [mock_embedding] + mock_response.usage = mock_usage + mock_client.embeddings.create.return_value = mock_response + + embedder = 
OpenAIDenseEmbedder( + model_name="text-embedding-3-small", + api_key="test-api-key", + dimension=1536, + ) + + mock_telemetry = MagicMock() + + with patch( + "openviking.models.embedder.openai_embedders.get_current_telemetry", + return_value=mock_telemetry, + ): + result = embedder.embed("Hello world") + + assert result.dense_vector is not None + mock_telemetry.add_token_usage_by_source.assert_called_once_with("embedding", 8, 0) diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go index f2ccde99..c20fdc66 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go @@ -24,9 +24,18 @@ type QueueBackend interface { // Enqueue adds a message to a queue Enqueue(queueName string, msg QueueMessage) error - // Dequeue removes and returns the first message from a queue + // Dequeue marks the first pending message as 'processing' and returns it. + // Call Ack after successful processing to permanently delete the message. Dequeue(queueName string) (QueueMessage, bool, error) + // Ack permanently deletes a message that has been successfully processed. + Ack(queueName string, messageID string) error + + // RecoverStale resets messages stuck in 'processing' state back to 'pending'. + // staleSec: minimum age in seconds; pass 0 to reset all processing messages. + // Returns the number of messages recovered. + RecoverStale(staleSec int64) (int, error) + // Peek returns the first message without removing it Peek(queueName string) (QueueMessage, bool, error) @@ -124,6 +133,16 @@ func (b *MemoryBackend) Dequeue(queueName string) (QueueMessage, bool, error) { return msg, true, nil } +// Ack is a no-op for the memory backend (messages are already removed on Dequeue). 
+func (b *MemoryBackend) Ack(queueName string, messageID string) error {
+	return nil
+}
+
+// RecoverStale is a no-op for the memory backend (no persistence across restarts).
+func (b *MemoryBackend) RecoverStale(staleSec int64) (int, error) {
+	return 0, nil
+}
+
 func (b *MemoryBackend) Peek(queueName string) (QueueMessage, bool, error) {
 	queue, exists := b.queues[queueName]
 	if !exists {
@@ -345,6 +364,16 @@ func (b *TiDBBackend) Enqueue(queueName string, msg QueueMessage) error {
 	return nil
 }
 
+// Ack is a no-op for the TiDB backend (messages are already soft-deleted on Dequeue).
+func (b *TiDBBackend) Ack(queueName string, messageID string) error {
+	return nil
+}
+
+// RecoverStale is not yet implemented for TiDB backend.
+func (b *TiDBBackend) RecoverStale(staleSec int64) (int, error) {
+	return 0, nil
+}
+
 func (b *TiDBBackend) Dequeue(queueName string) (QueueMessage, bool, error) {
 	// Get table name from cache (lazy loading)
 	tableName, err := b.getTableName(queueName, false)
diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go
index 03b7342f..9639531c 100644
--- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go
+++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go
@@ -63,16 +63,22 @@ func (b *SQLiteDBBackend) GetInitSQL() []string {
 			last_updated INTEGER DEFAULT (strftime('%s', 'now'))
 		)`,
 		// Queue messages table
+		// status: 'pending' (waiting) | 'processing' (dequeued, not yet acked)
+		// processing_started_at: Unix timestamp when dequeued; NULL if pending
 		`CREATE TABLE IF NOT EXISTS queue_messages (
 			id INTEGER PRIMARY KEY AUTOINCREMENT,
 			queue_name TEXT NOT NULL,
 			message_id TEXT NOT NULL,
 			data TEXT NOT NULL,
 			timestamp INTEGER NOT NULL,
+			status TEXT NOT NULL DEFAULT 'pending',
+			processing_started_at INTEGER,
 			created_at INTEGER DEFAULT (strftime('%s', 'now'))
 		)`,
 		`CREATE INDEX IF NOT EXISTS idx_queue_name ON 
queue_messages(queue_name)`, `CREATE INDEX IF NOT EXISTS idx_queue_order ON queue_messages(queue_name, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, } } diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go index d8d481b0..052a8f19 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go @@ -137,7 +137,9 @@ func (q *QueueFSPlugin) Initialize(cfg map[string]interface{}) error { switch backendType { case "memory": backend = NewMemoryBackend() - case "tidb", "mysql", "sqlite", "sqlite3": + case "sqlite", "sqlite3": + backend = NewSQLiteQueueBackend() + case "tidb", "mysql": backend = NewTiDBBackend() default: return fmt.Errorf("unsupported backend: %s", backendType) @@ -384,6 +386,7 @@ var queueOperations = map[string]bool{ "peek": true, "size": true, "clear": true, + "ack": true, // write message_id to confirm processing complete (at-least-once delivery) } // parseQueuePath parses a path like "/queue_name/operation" or "/dir/queue_name/operation" @@ -529,7 +532,7 @@ func (qfs *queueFS) Read(path string, offset int64, size int64) ([]byte, error) data, err = qfs.peek(queueName) case "size": data, err = qfs.size(queueName) - case "enqueue", "clear": + case "enqueue", "clear", "ack": // Write-only files return []byte(""), fmt.Errorf("permission denied: %s is write-only", path) default: @@ -573,6 +576,12 @@ func (qfs *queueFS) Write(path string, data []byte, offset int64, flags filesyst return 0, err } return 0, nil + case "ack": + msgID := strings.TrimSpace(string(data)) + if err := qfs.ackMessage(queueName, msgID); err != nil { + return 0, err + } + return int64(len(data)), nil default: return 0, fmt.Errorf("cannot write to: %s", path) } @@ -844,7 +853,7 @@ func (qfs 
*queueFS) Stat(p string) (*filesystem.FileInfo, error) { } mode := uint32(0644) - if operation == "enqueue" || operation == "clear" { + if operation == "enqueue" || operation == "clear" || operation == "ack" { mode = 0222 } else { mode = 0444 @@ -992,6 +1001,13 @@ func (qfs *queueFS) clear(queueName string) error { return qfs.plugin.backend.Clear(queueName) } +func (qfs *queueFS) ackMessage(queueName string, msgID string) error { + qfs.plugin.mu.Lock() + defer qfs.plugin.mu.Unlock() + + return qfs.plugin.backend.Ack(queueName, msgID) +} + // Ensure QueueFSPlugin implements ServicePlugin var _ plugin.ServicePlugin = (*QueueFSPlugin)(nil) var _ filesystem.FileSystem = (*queueFS)(nil) diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go new file mode 100644 index 00000000..2a0c4dbe --- /dev/null +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go @@ -0,0 +1,321 @@ +package queuefs + +import ( + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + log "github.com/sirupsen/logrus" +) + +// SQLiteQueueBackend implements QueueBackend using SQLite with a single-table schema. 
+//
+// Schema:
+// - queue_metadata: tracks all queues (including empty ones created via mkdir)
+// - queue_messages: stores all messages, filtered by queue_name column
+// - status: 'pending' (waiting to be processed) | 'processing' (dequeued, awaiting ack)
+// - processing_started_at: Unix timestamp when dequeued; NULL while pending
+//
+// Delivery semantics: at-least-once
+// - Dequeue marks message as 'processing' (does NOT delete)
+// - Ack deletes the message after successful processing
+// - On startup, RecoverStale resets all 'processing' messages back to 'pending'
+// so that messages from a previous crashed run are automatically retried
+type SQLiteQueueBackend struct {
+	db *sql.DB
+}
+
+func NewSQLiteQueueBackend() *SQLiteQueueBackend {
+	return &SQLiteQueueBackend{}
+}
+
+func (b *SQLiteQueueBackend) Initialize(config map[string]interface{}) error {
+	dbBackend := NewSQLiteDBBackend()
+
+	db, err := dbBackend.Open(config)
+	if err != nil {
+		return fmt.Errorf("failed to open SQLite database: %w", err)
+	}
+	b.db = db
+
+	for _, sqlStmt := range dbBackend.GetInitSQL() {
+		if _, err := db.Exec(sqlStmt); err != nil {
+			db.Close()
+			return fmt.Errorf("failed to initialize schema: %w", err)
+		}
+	}
+
+	// Migrate existing databases: add new columns if they don't exist yet.
+	b.runMigrations()
+
+	// Reset any messages left in 'processing' state by a previous crashed process.
+	// staleSec=0 resets ALL processing messages — safe at startup because no workers
+	// are running yet.
+	if n, err := b.RecoverStale(0); err != nil {
+		log.Warnf("[queuefs] Failed to recover stale messages on startup: %v", err)
+	} else if n > 0 {
+		log.Infof("[queuefs] Recovered %d in-flight message(s) from previous run", n)
+	}
+
+	log.Info("[queuefs] SQLite backend initialized")
+	return nil
+}
+
+// runMigrations applies schema changes needed to upgrade an existing database.
+// Each statement is executed; "duplicate column name" and "already exists" errors are silently ignored. 
+func (b *SQLiteQueueBackend) runMigrations() { + migrations := []string{ + `ALTER TABLE queue_messages ADD COLUMN status TEXT NOT NULL DEFAULT 'pending'`, + `ALTER TABLE queue_messages ADD COLUMN processing_started_at INTEGER`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, + } + for _, stmt := range migrations { + if _, err := b.db.Exec(stmt); err != nil { + // "duplicate column name" means the column already exists — that's fine. + if !strings.Contains(err.Error(), "duplicate column name") && + !strings.Contains(err.Error(), "already exists") { + log.Warnf("[queuefs] Migration warning: %v", err) + } + } + } +} + +func (b *SQLiteQueueBackend) Close() error { + if b.db != nil { + return b.db.Close() + } + return nil +} + +func (b *SQLiteQueueBackend) GetType() string { + return "sqlite" +} + +func (b *SQLiteQueueBackend) Enqueue(queueName string, msg QueueMessage) error { + msgData, err := json.Marshal(msg) + if err != nil { + return fmt.Errorf("failed to marshal message: %w", err) + } + + _, err = b.db.Exec( + "INSERT INTO queue_messages (queue_name, message_id, data, timestamp, status) VALUES (?, ?, ?, ?, 'pending')", + queueName, msg.ID, string(msgData), msg.Timestamp.Unix(), + ) + if err != nil { + return fmt.Errorf("failed to enqueue message: %w", err) + } + return nil +} + +// Dequeue marks the first pending message as 'processing' and returns it. +// The message remains in the database until Ack is called. +// If the process crashes before Ack, RecoverStale on the next startup will +// reset the message back to 'pending' so it is retried. 
+func (b *SQLiteQueueBackend) Dequeue(queueName string) (QueueMessage, bool, error) { + tx, err := b.db.Begin() + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to start transaction: %w", err) + } + defer tx.Rollback() + + var id int64 + var data string + err = tx.QueryRow( + "SELECT id, data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&id, &data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to query message: %w", err) + } + + // Mark as processing instead of deleting. + _, err = tx.Exec( + "UPDATE queue_messages SET status = 'processing', processing_started_at = ? WHERE id = ?", + time.Now().Unix(), id, + ) + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to mark message as processing: %w", err) + } + + if err := tx.Commit(); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to commit transaction: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Ack deletes a message that has been successfully processed. +// Should be called after the consumer has finished processing the message. +func (b *SQLiteQueueBackend) Ack(queueName string, messageID string) error { + result, err := b.db.Exec( + "DELETE FROM queue_messages WHERE queue_name = ? AND message_id = ? AND status = 'processing'", + queueName, messageID, + ) + if err != nil { + return fmt.Errorf("failed to ack message: %w", err) + } + rows, _ := result.RowsAffected() + if rows == 0 { + log.Warnf("[queuefs] Ack found no matching processing message: queue=%s msg=%s", queueName, messageID) + } + return nil +} + +// RecoverStale resets messages stuck in 'processing' state back to 'pending'. 
+// staleSec is the minimum age (in seconds) of a processing message before it +// is considered stale. Pass 0 to reset ALL processing messages immediately +// (appropriate at startup before any workers have started). +// Returns the number of messages recovered. +func (b *SQLiteQueueBackend) RecoverStale(staleSec int64) (int, error) { + cutoff := time.Now().Unix() - staleSec + result, err := b.db.Exec( + "UPDATE queue_messages SET status = 'pending', processing_started_at = NULL WHERE status = 'processing' AND processing_started_at <= ?", + cutoff, + ) + if err != nil { + return 0, fmt.Errorf("failed to recover stale messages: %w", err) + } + n, _ := result.RowsAffected() + return int(n), nil +} + +func (b *SQLiteQueueBackend) Peek(queueName string) (QueueMessage, bool, error) { + var data string + err := b.db.QueryRow( + "SELECT data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to peek message: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Size returns the number of pending (not yet dequeued) messages. +func (b *SQLiteQueueBackend) Size(queueName string) (int, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_messages WHERE queue_name = ? 
AND status = 'pending'", + queueName, + ).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to get queue size: %w", err) + } + return count, nil +} + +func (b *SQLiteQueueBackend) Clear(queueName string) error { + _, err := b.db.Exec("DELETE FROM queue_messages WHERE queue_name = ?", queueName) + if err != nil { + return fmt.Errorf("failed to clear queue: %w", err) + } + return nil +} + +func (b *SQLiteQueueBackend) ListQueues(prefix string) ([]string, error) { + var query string + var args []interface{} + + if prefix == "" { + query = "SELECT queue_name FROM queue_metadata" + } else { + query = "SELECT queue_name FROM queue_metadata WHERE queue_name = ? OR queue_name LIKE ?" + args = []interface{}{prefix, prefix + "/%"} + } + + rows, err := b.db.Query(query, args...) + if err != nil { + return nil, fmt.Errorf("failed to list queues: %w", err) + } + defer rows.Close() + + var queues []string + for rows.Next() { + var qName string + if err := rows.Scan(&qName); err != nil { + return nil, fmt.Errorf("failed to scan queue name: %w", err) + } + queues = append(queues, qName) + } + return queues, nil +} + +func (b *SQLiteQueueBackend) GetLastEnqueueTime(queueName string) (time.Time, error) { + var timestamp sql.NullInt64 + err := b.db.QueryRow( + "SELECT MAX(timestamp) FROM queue_messages WHERE queue_name = ? AND status = 'pending'", + queueName, + ).Scan(×tamp) + + if err != nil || !timestamp.Valid { + return time.Time{}, nil + } + return time.Unix(timestamp.Int64, 0), nil +} + +func (b *SQLiteQueueBackend) RemoveQueue(queueName string) error { + if queueName == "" { + if _, err := b.db.Exec("DELETE FROM queue_messages"); err != nil { + return err + } + _, err := b.db.Exec("DELETE FROM queue_metadata") + return err + } + + if _, err := b.db.Exec( + "DELETE FROM queue_messages WHERE queue_name = ? 
OR queue_name LIKE ?", + queueName, queueName+"/%", + ); err != nil { + return fmt.Errorf("failed to remove queue messages: %w", err) + } + + _, err := b.db.Exec( + "DELETE FROM queue_metadata WHERE queue_name = ? OR queue_name LIKE ?", + queueName, queueName+"/%", + ) + return err +} + +func (b *SQLiteQueueBackend) CreateQueue(queueName string) error { + _, err := b.db.Exec( + "INSERT OR IGNORE INTO queue_metadata (queue_name) VALUES (?)", + queueName, + ) + if err != nil { + return fmt.Errorf("failed to create queue: %w", err) + } + log.Infof("[queuefs] Created queue '%s' (SQLite)", queueName) + return nil +} + +func (b *SQLiteQueueBackend) QueueExists(queueName string) (bool, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_metadata WHERE queue_name = ?", + queueName, + ).Scan(&count) + if err != nil { + return false, fmt.Errorf("failed to check queue existence: %w", err) + } + return count > 0, nil +}