From bb53102c10df403307ecd8f795ffb3c6f23a0e77 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Thu, 5 Mar 2026 15:10:27 +0800 Subject: [PATCH 01/18] feat(storage): add transaction support with journal, undo, and crash recovery Implement a full transaction system for VikingFS storage operations including write-ahead journal, path locking, undo/rollback, context manager API, and crash recovery. Includes comprehensive tests and documentation. Co-Authored-By: Claude Opus 4.6 --- docs/en/concepts/09-transaction.md | 330 ++++++++++ docs/en/guides/01-configuration.md | 30 +- docs/zh/concepts/09-transaction.md | 367 ++++++++--- docs/zh/guides/01-configuration.md | 32 +- openviking/agfs_manager.py | 15 + openviking/parse/tree_builder.py | 62 +- openviking/service/core.py | 22 +- openviking/session/session.py | 145 ++++- openviking/storage/errors.py | 12 + openviking/storage/queuefs/named_queue.py | 32 +- openviking/storage/queuefs/queue_manager.py | 21 +- openviking/storage/queuefs/semantic_dag.py | 41 +- .../storage/queuefs/semantic_processor.py | 61 +- openviking/storage/transaction/__init__.py | 9 +- .../storage/transaction/context_manager.py | 146 +++++ openviking/storage/transaction/journal.py | 114 ++++ openviking/storage/transaction/path_lock.py | 609 ++++++++++++------ .../transaction/transaction_manager.py | 267 +++++++- .../storage/transaction/transaction_record.py | 71 +- openviking/storage/transaction/undo.py | 147 +++++ openviking/storage/viking_fs.py | 145 ++++- .../storage/viking_vector_index_backend.py | 16 +- openviking/utils/agfs_utils.py | 4 + openviking_cli/utils/config/storage_config.py | 6 + .../utils/config/transaction_config.py | 37 ++ tests/transaction/__init__.py | 0 tests/transaction/conftest.py | 56 ++ tests/transaction/test_concurrent_lock.py | 103 +++ tests/transaction/test_context_manager.py | 224 +++++++ tests/transaction/test_crash_recovery.py | 385 +++++++++++ tests/transaction/test_e2e.py | 238 +++++++ tests/transaction/test_journal.py | 215 
+++++++ tests/transaction/test_path_lock.py | 334 ++++++++++ tests/transaction/test_post_actions.py | 112 ++++ tests/transaction/test_rm_rollback.py | 233 +++++++ tests/transaction/test_transaction_manager.py | 323 ++++++++++ tests/transaction/test_undo.py | 163 +++++ .../pkg/plugins/queuefs/backend.go | 31 +- .../pkg/plugins/queuefs/db_backend.go | 6 + .../pkg/plugins/queuefs/queuefs.go | 22 +- .../pkg/plugins/queuefs/sqlite_backend.go | 321 +++++++++ 41 files changed, 5061 insertions(+), 446 deletions(-) create mode 100644 docs/en/concepts/09-transaction.md create mode 100644 openviking/storage/transaction/context_manager.py create mode 100644 openviking/storage/transaction/journal.py create mode 100644 openviking/storage/transaction/undo.py create mode 100644 openviking_cli/utils/config/transaction_config.py create mode 100644 tests/transaction/__init__.py create mode 100644 tests/transaction/conftest.py create mode 100644 tests/transaction/test_concurrent_lock.py create mode 100644 tests/transaction/test_context_manager.py create mode 100644 tests/transaction/test_crash_recovery.py create mode 100644 tests/transaction/test_e2e.py create mode 100644 tests/transaction/test_journal.py create mode 100644 tests/transaction/test_path_lock.py create mode 100644 tests/transaction/test_post_actions.py create mode 100644 tests/transaction/test_rm_rollback.py create mode 100644 tests/transaction/test_transaction_manager.py create mode 100644 tests/transaction/test_undo.py create mode 100644 third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md new file mode 100644 index 00000000..65ec4c3b --- /dev/null +++ b/docs/en/concepts/09-transaction.md @@ -0,0 +1,330 @@ +# Transaction Mechanism + +OpenViking's transaction mechanism protects the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain 
consistent even when failures occur. + +## Design Philosophy + +OpenViking is a context database where FS is the source of truth and VectorDB is a derived index. A lost index can be rebuilt from source data, but lost source data is unrecoverable. Therefore: + +> **Better to miss a search result than to return a bad one.** + +## Design Principles + +1. **Transactions cover synchronous operations only**: FS + VectorDB operations run inside transactions; SemanticQueue/EmbeddingQueue enqueue runs after commit (as post_actions) — they are idempotent and retriable +2. **On by default**: All data operations automatically use transactions; no extra configuration needed +3. **Write-exclusive**: Path locks ensure only one write transaction can operate on a path at a time +4. **Undo Log model**: Record reverse operations before each change; replay them in reverse order on failure +5. **Persistent journal**: Each transaction writes a journal file to AGFS for crash recovery + +## Architecture + +``` +Service Layer (rm / mv / add_resource / session.commit) + | + v ++--[TransactionContext async context manager]--+ +| | +| 1. Create transaction + write journal | +| 2. Acquire path lock (poll + timeout) | +| 3. Execute operations (FS + VectorDB) | +| 4. Record Undo Log (mark completed) | +| 5. Commit / Rollback | +| 6. Execute post_actions (enqueue etc) | +| 7. Release lock + clean up journal | +| | +| On exception: reverse Undo Log + unlock | ++----------------------------------------------+ + | + v +Storage Layer (VikingFS, VectorDB, QueueManager) +``` + +## Consistency Issues and Solutions + +### rm(uri) + +| Problem | Solution | +|---------|----------| +| Delete file first, then index -> file gone but index remains -> search returns non-existent file | **Reverse order**: delete index first, then file. Index deletion failure -> both file and index intact | + +Transaction flow: + +``` +1. Begin transaction, acquire lock (lock_mode="subtree") +2. 
Snapshot VectorDB records (for rollback recovery) +3. Delete VectorDB index -> immediately invisible to search +4. Delete FS file +5. Commit -> release lock -> delete journal +``` + +Rollback: Step 4 fails -> restore VectorDB records from snapshot. + +### mv(old_uri, new_uri) + +| Problem | Solution | +|---------|----------| +| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Transaction wrapper; rollback on failure | + +Transaction flow: + +``` +1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on source + POINT on destination) +2. Move FS file +3. Update VectorDB URIs +4. Commit -> release lock -> delete journal +``` + +Rollback: Step 3 fails -> move file back to original location. + +### add_resource (TreeBuilder.finalize_from_temp) + +| Problem | Solution | +|---------|----------| +| File moved from temp to final directory, then crash -> file exists but never searchable | Transaction wrapper for mv + post_action protects enqueue | + +Transaction flow: + +``` +1. Begin transaction, lock final_uri (lock_mode="point") +2. mv temp directory -> final location +3. Register post_action: enqueue SemanticQueue +4. Commit -> execute post_action -> release lock -> delete journal +``` + +Crash recovery: Journal records the post_action; replayed automatically on restart. + +### session.commit() + +| Problem | Solution | +|---------|----------| +| Messages cleared but archive not written -> conversation data lost | Split into two transactions + checkpoint | + +LLM calls have unpredictable latency (5s~60s+), so they cannot be inside a transaction. Split into: + +``` +Transaction 1 (Archive): + 1. Write archive (history/archive_N/messages.jsonl + summaries) + 2. Clear messages.jsonl + 3. Write checkpoint (status="archived") + 4. Commit + +LLM call (no transaction): + Extract memories from archived messages + +Transaction 2 (Memory write): + 1. Write memory files + 2. Write relations + 3. 
Update checkpoint (status="completed") + 4. Register post_action: enqueue SemanticQueue + 5. Commit +``` + +Crash recovery: Read checkpoint, resume from the appropriate step based on status. + +## TransactionContext + +`TransactionContext` is an **async** context manager that encapsulates the full transaction lifecycle: + +```python +from openviking.storage.transaction import TransactionContext, get_transaction_manager + +tx_manager = get_transaction_manager() + +async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + # Record undo (call before making changes) + seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) + # Execute change + delete_from_vector_store(uris) + # Mark completed + tx.mark_completed(seq) + + # Register post-commit action (optional) + tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) + + # Commit + await tx.commit() +# Auto-rollback if commit() not called +``` + +**Lock modes**: + +| lock_mode | Use case | Behavior | +|-----------|----------|----------| +| `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | +| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path and any lock on descendants | +| `mv` | Move operations | Acquire SUBTREE lock on source path, then POINT lock on destination path | + +## Lock Types (POINT vs SUBTREE) + +The lock mechanism uses two lock types to handle different conflict patterns: + +| | POINT on same path | SUBTREE on same path | POINT on descendant | SUBTREE on ancestor | +|---|---|---|---|---| +| **POINT** | Conflict | Conflict | — | Conflict | +| **SUBTREE** | Conflict | Conflict | Conflict | — | + +- **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. +- **SUBTREE (S)**: Used for rm and mv-source operations. 
Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants for conflicting locks. + +## Undo Log + +Each transaction maintains an Undo Log recording the reverse action for each step: + +| op_type | Forward operation | Rollback action | +|---------|-------------------|-----------------| +| `fs_mv` | Move file | Move back | +| `fs_rm` | Delete file | Skip (irreversible; rm is always the last step by design) | +| `fs_write_new` | Create new file/directory | Delete | +| `fs_mkdir` | Create directory | Delete | +| `vectordb_delete` | Delete index records | Restore from snapshot | +| `vectordb_upsert` | Insert index records | Delete | +| `vectordb_update_uri` | Update URI | Restore old value | + +Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. + +## Lock Mechanism + +### Lock Protocol + +Lock file path: `{path}/.path.ovlock` + +Lock file content (Fencing Token): +``` +{transaction_id}:{time_ns}:{lock_type} +``` + +Where `lock_type` is `P` (POINT) or `S` (SUBTREE). + +### Lock Acquisition (POINT mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another transaction + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Write POINT (P) lock file + 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks + - Conflict found: compare (timestamp, tx_id) + - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 6. Verify lock file ownership (fencing token matches) + 7. 
Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError +``` + +### Lock Acquisition (SUBTREE mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another transaction + 3. Scan all descendant directories for any locks by other transactions + 4. Write SUBTREE (S) lock file (only one file, at the root path) + 5. TOCTOU double-check: re-scan descendants for new locks + - Conflict found: later one backs off (livelock prevention) + 6. Verify lock file ownership + 7. Success +``` + +### Lock Expiry Cleanup + +**Stale lock detection**: PathLock checks the fencing token timestamp. Locks older than `lock_expire` (default 300s) are considered stale and are removed automatically during acquisition. + +**Transaction timeout**: TransactionManager checks active transactions every 60 seconds. Transactions with `updated_at` exceeding the transaction timeout (default 3600s) are rolled back. + +## Transaction Journal + +Each transaction persists a journal in AGFS: + +``` +/local/_system/transactions/{tx_id}/journal.json +``` + +Contains: transaction ID, status, lock paths, init_info, undo_log, post_actions. 
+ +### Lifecycle + +``` +Create transaction -> write journal (INIT) +Acquire lock -> update journal (AQUIRE -> EXEC) +Execute changes -> update journal per step (mark undo entry completed) +Commit -> update journal (COMMIT + post_actions) + -> execute post_actions -> release locks -> delete journal +Rollback -> execute undo log -> release locks -> delete journal +``` + +## Crash Recovery + +`TransactionManager.start()` automatically scans for residual journals on startup: + +| Journal status at crash | Recovery action | +|------------------------|----------------| +| `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | +| `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | +| `INIT` / `AQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | + +### Defense Summary + +| Failure scenario | Defense | Recovery timing | +|-----------------|--------|-----------------| +| Crash during transaction | Journal + undo log rollback | On restart | +| Crash after commit, before enqueue | Journal post_actions replay | On restart | +| Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | +| Crash during session.commit LLM call | Checkpoint file recovery | On restart, re-invoke LLM | +| Orphan index | Cleaned on L2 on-demand load | When user accesses | +| Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | + +## Transaction State Machine + +``` +INIT -> AQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED + | + FAIL -> RELEASING -> RELEASED +``` + +- `INIT`: Transaction created, waiting for lock +- `AQUIRE`: Acquiring lock +- `EXEC`: Transaction operations executing +- `COMMIT`: Committed, post_actions may be 
pending +- `FAIL`: Execution failed, entering rollback +- `RELEASING`: Releasing locks +- `RELEASED`: Locks released, transaction complete + +## Configuration + +The transaction mechanism is enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | +| `max_parallel_locks` | int | Max parallel locks for rm/mv operations | `8` | + +### QueueFS Persistence + +The transaction mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. + +## Related Documentation + +- [Architecture](./01-architecture.md) - System architecture overview +- [Storage](./05-storage.md) - AGFS and vector store +- [Session Management](./08-session.md) - Session and memory management +- [Configuration](../guides/01-configuration.md) - Configuration reference diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 764d7047..8c2c3420 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -479,7 +479,6 @@ Supports S3 storage in VirtualHostStyle mode, such as TOS. 
- #### vectordb Vector database storage configuration @@ -603,6 +602,30 @@ When `root_api_key` is configured, the server enables multi-tenant authenticatio For startup and deployment details see [Deployment](./03-deployment.md), for authentication see [Authentication](./04-authentication.md). +## storage.transaction Section + +The transaction mechanism is enabled by default and usually requires no configuration. **The default behavior is no-wait**: if the target path is already locked by another transaction, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Path lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds, then raise `LockAcquisitionError`. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | +| `max_parallel_locks` | int | Max parallel locks during recursive locking for rm/mv operations | `8` | + +For details on the transaction mechanism, see [Transaction Mechanism](../concepts/09-transaction.md). 
+ ## Full Schema ```json @@ -637,6 +660,11 @@ For startup and deployment details see [Deployment](./03-deployment.md), for aut "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 503ed683..99723042 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -1,167 +1,330 @@ # 事务机制 -OpenViking 的事务机制为 AI Agent 上下文数据库提供可靠的操作保障,解决数据一致性、并发控制和错误恢复等核心问题。 +OpenViking 的事务机制保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 -## 概览 +## 设计哲学 -``` -操作请求 → TransactionManager → 锁保护 → 执行操作 → 状态更新 - ↓ ↓ ↓ - 事务ID分配和事务状态管理 路径锁校验和加锁 +OpenViking 是上下文数据库,FS 是源数据,VectorDB 是派生索引。索引丢了可从源数据重建,源数据丢失不可恢复。因此: + +> **宁可搜不到,不要搜到坏结果。** + +## 设计原则 +1. **事务只覆盖同步部分**:FS + VectorDB 操作在事务内;SemanticQueue/EmbeddingQueue 的 enqueue 在事务提交后执行(post_actions),它们是幂等的,失败可重试 +2. **默认生效**:所有数据操作命令自动开启事务机制,用户无需额外配置 +3. **写互斥**:通过路径锁保证同一路径同一时间只有一个写事务 +4. **Undo Log 模型**:变更前记录反向操作,失败时反序执行回滚 +5. **事务日志持久化**:每个事务在 AGFS 中写入 journal 文件,支持崩溃恢复 -事务生命周期:开始操作 → 创建事务 → 锁保护生效 → 文件系统同步操作 → 摘要和索引异步操作 → 移除锁保护 → 事务结束 +## 架构 + +``` +Service Layer (rm / mv / add_resource / session.commit) + │ + ▼ +┌──[TransactionContext 异步上下文管理器]──┐ +│ │ +│ 1. 创建事务 + 写 journal │ +│ 2. 获取路径锁(轮询 + 超时) │ +│ 3. 执行操作(FS + VectorDB) │ +│ 4. 记录 Undo Log(每步完成后标记) │ +│ 5. Commit / Rollback │ +│ 6. 执行 post_actions(enqueue 等) │ +│ 7. 释放锁 + 清理 journal │ +│ │ +│ 异常时:反序执行 Undo Log → 释放锁 │ +└─────────────────────────────────────────┘ + │ + ▼ +Storage Layer (VikingFS, VectorDB, QueueManager) ``` -**设计原则**: -1. **最小化锁粒度**:仅支持路径锁机制,不实现复杂的 MVCC 等 -2. **写互斥优先**:暂不实现读锁(共享锁),先承诺写操作的互斥性 -3. **渐进式扩展**:避免过度设计,聚焦核心需求,未来需要时再添加更复杂的锁机制 -4. 
**默认生效**:所有数据操作命令均开启事务机制,用户无需额外配置 +## 一致性问题与解决方案 + +### rm(uri) -## 核心需求分析 +| 问题 | 方案 | +|------|------| +| 先删文件再删索引 → 文件已删但索引残留 → 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 → 文件和索引都在,搜索正常 | -OpenViking 的数据操作命令(如 `add_resource`、`rm`、`mv` 等)存在以下无保护操作问题: +事务流程: -1. **并发冲突**:多个用户同时操作同一目录可能导致数据不一致 -2. **无原子性**:`add_resource` 多阶段操作中,某个阶段失败可能留下中间状态 -3. **无可观测性**:操作结果无法预测,用户无法直接观察到正在操作的状态 +``` +1. 开始事务,加锁(lock_mode="subtree") +2. 快照 VectorDB 中受影响的记录(用于回滚恢复) +3. 删除 VectorDB 索引 → 搜索立刻不可见 +4. 删除 FS 文件 +5. 提交 → 删锁 → 删 journal +``` + +回滚:第 4 步失败 → 从快照恢复 VectorDB 记录,文件和索引都在。 -## 系统一致性要求 +### mv(old_uri, new_uri) -从系统分析的角度,OpenViking 要求实现组件间的分布式一致性: +| 问题 | 方案 | +|------|------| +| 文件移到新路径但索引指向旧路径 → 搜索返回旧路径(不存在) | 事务包装,移动失败则回滚 | -1. **向量索引的最终一致**:所有上下文数据的向量表征依托独立的向量数据库或向量索引实现,要求确保在任何操作序列下,向量表示的更新都能实现最终一致 -2. **文件系统的读写一致性**:所有上下文数据的文件系统表示依托 VikingFS 实现,底层为 AGFS 桥接的分布式文件系统,要求确保在任何操作序列下,文件系统的更新都能保证数据不会损坏或丢失 -3. **队列和异步数据处理的一致性**:所有上下文数据的异步操作依托队列实现,要求确保在任何操作序列下,队列中的数据都能实现最终一致,即队列中的数据会最终被处理,不会丢失或重复 +事务流程: -## TransactionManager(事务管理器) +``` +1. 开始事务,加锁(lock_mode="mv",源路径 SUBTREE + 目标路径 POINT) +2. 移动 FS 文件 +3. 更新 VectorDB 中的 URI +4. 提交 → 删锁 → 删 journal +``` -TransactionManager 是全局单例,负责管理事务生命周期和锁机制实现。 +回滚:第 3 步失败 → 把文件移回原位。 -### 核心职责 +### add_resource (TreeBuilder.finalize_from_temp) -- 分配事务ID -- 管理事务生命周期(开始、提交、回滚) -- 提供事务的锁机制实现接口,防止死锁 +| 问题 | 方案 | +|------|------| +| 文件从临时目录移到正式目录后崩溃 → 文件存在但永远搜不到 | 事务包装 mv + post_action 保护 enqueue | -### 关键特性 +事务流程: ``` -路径锁 + 写互斥 = 并发冲突防护 +1. 开始事务,加锁(lock_mode="point",锁 final_uri) +2. mv 临时目录 → 正式位置 +3. 注册 post_action: enqueue SemanticQueue +4. 
提交 → 执行 post_action → 删锁 → 删 journal ``` -- **路径锁**:锁定目标目录,防止并发的目录级操作如目录删除、目录移动等 -- **写互斥**:同一时间只允许一个事务写操作,路径锁机制确保所有写操作的互斥性 -- **事务结束状态**:事务有明确的结束状态,包括完成、失败丢弃等 +崩溃恢复:journal 中记录了 post_action,重启时自动重放 enqueue。 + +### session.commit() -### 事务状态机 +| 问题 | 方案 | +|------|------| +| 消息已清空但 archive 未写入 → 对话数据丢失 | 拆为两段事务 + checkpoint | + +LLM 调用耗时不可控(5s~60s+),放在事务内会长时间持锁。因此拆为: ``` -INIT → AQUIRE → EXEC → COMMIT/FAIL → RELEASING → RELEASED +第一段事务(归档): + 1. 写 archive(history/archive_N/messages.jsonl + 摘要) + 2. 清空 messages.jsonl + 3. 写 checkpoint(status="archived") + 4. 提交 + +LLM 调用(无事务): + 从归档消息提取 memories + +第二段事务(memory 写入): + 1. 写 memory 文件 + 2. 写 relations + 3. 更新 checkpoint(status="completed") + 4. 注册 post_action: enqueue SemanticQueue + 5. 提交 ``` -**状态说明**: -- `INIT`:事务初始化完成,等待锁获取 -- `AQUIRE`:正在获取锁资源 -- `EXEC`:事务操作正在执行 -- `COMMIT/FAIL`:事务执行完成,进入最终状态 -- `RELEASING`:正在释放锁资源 -- `RELEASED`:锁资源已完全释放,事务结束 +崩溃恢复:读 checkpoint,根据 status 决定从哪一步继续。 + +## TransactionContext -### 事务记录属性 +`TransactionContext` 是**异步**上下文管理器,封装事务的完整生命周期: ```python -TransactionRecord( - id: str, # 事务ID,采用 uuid 格式,唯一标识一个事务 - locks: List[str], # 锁列表 - status: str, # 当前状态 - init_info: Dict, # 事务初始化信息 - rollback_info: Dict, # 回滚信息 - created_at: float, # 创建时间 - updated_at: float, # 更新时间 -) +from openviking.storage.transaction import TransactionContext, get_transaction_manager + +tx_manager = get_transaction_manager() + +async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + # 记录 undo(变更前调用) + seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) + # 执行变更 + delete_from_vector_store(uris) + # 标记完成 + tx.mark_completed(seq) + + # 注册提交后动作(可选) + tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) + + # 提交 + await tx.commit() +# 未 commit 时自动回滚 ``` -### 设计决策 +**锁模式**: -- 暂不实现共享锁(读锁),简化设计 -- 锁粒度仅限目录,不实现范围锁机制 -- 不实现复杂的死锁检测,通过超时机制防止死锁,事务超时后自动释放所有锁 -- 支持可选的自下而上并行加锁模式,提升大型目录树操作的性能和一致性 -- 事务状态机增加AQUIRE+RELEASING状态,明确跟踪锁释放过程,提高系统可观测性 +| lock_mode | 用途 
| 行为 | +|-----------|------|------| +| `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁和后代目录的任何锁冲突 | +| `mv` | 移动操作 | 源路径加 SUBTREE 锁,目标路径加 POINT 锁 | -## 锁机制 +## 锁类型(POINT vs SUBTREE) + +锁机制使用两种锁类型来处理不同的冲突场景: + +| | 同路径 POINT | 同路径 SUBTREE | 后代 POINT | 祖先 SUBTREE | +|---|---|---|---|---| +| **POINT** | 冲突 | 冲突 | — | 冲突 | +| **SUBTREE** | 冲突 | 冲突 | 冲突 | — | + +- **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 +- **SUBTREE (S)**:用于删除和移动源操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代确认无冲突锁。 + +## Undo Log -锁机制是事务管理的核心组件,当前只提供路径锁类型。 +每个事务维护一个 Undo Log,记录每步操作的反向动作: -### 锁类型 +| op_type | 正向操作 | 回滚动作 | +|---------|---------|---------| +| `fs_mv` | 移动文件 | 移回原位 | +| `fs_rm` | 删除文件 | 跳过(不可逆,设计上 rm 是最后一步) | +| `fs_write_new` | 创建新文件/目录 | 删除 | +| `fs_mkdir` | 创建目录 | 删除 | +| `vectordb_delete` | 删除索引记录 | 从快照恢复 | +| `vectordb_upsert` | 插入索引记录 | 删除 | +| `vectordb_update_uri` | 更新 URI | 恢复旧值 | -| 锁类型 | 作用范围 | 用例 | -|--------|----------|------| -| 路径锁 | 整个目录 | 用于阻止目录被意外整体移动或删除,确保事务操作过程的路径合法性 +回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 + +## 锁机制 ### 锁协议 +锁文件路径:`{path}/.path.ovlock` + +锁文件内容(Fencing Token): ``` -viking://resources/github/volcengine/OpenViking/.path.ovlock +{transaction_id}:{time_ns}:{lock_type} ``` -- 锁文件存在即表示已加锁 -- 文件内容为事务ID,用于标识当前事务 -- 事务操作完成后,删除锁文件以释放锁 +其中 `lock_type` 为 `P`(POINT)或 `S`(SUBTREE)。 -### 加锁流程 +### 获取锁流程(POINT 模式) + +``` +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他事务锁定 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 4. 写入 POINT (P) 锁文件 + 5. TOCTOU 双重检查:重新扫描祖先目录的 SUBTREE 锁 + - 发现冲突:比较 (timestamp, tx_id) + - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 6. 验证锁文件归属(fencing token 匹配) + 7. 成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError +``` -#### 普通操作加锁流程 +### 获取锁流程(SUBTREE 模式) ``` -1. 检查目标目录是否存在 -2. 检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 创建 .path.ovlock 文件,文件内容为事务ID -5. 
再次检查目标目录的父目录是否已被其他事务锁定 -6. 读取刚创建的 .path.ovlock 文件内容,确认为当前事务ID -7. 一切正常,则返回加锁成功 +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他事务锁定 + 3. 扫描所有后代目录,检查是否有其他事务持有的锁 + 4. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) + 5. TOCTOU 双重检查:重新扫描后代目录 + - 发现冲突:后到者主动让步(活锁防止) + 6. 验证锁文件归属 + 7. 成功 ``` -#### rm 操作加锁流程 +### 锁过期清理 +**陈旧锁检测**:PathLock 检查 fencing token 中的时间戳。超过 `lock_expire`(默认 300s)的锁被视为陈旧锁,在加锁过程中自动移除。 + +**事务超时**:TransactionManager 每 60 秒检查活跃事务,`updated_at` 超过事务超时时间(默认 3600s)的事务强制回滚。 + +## 事务日志(Journal) + +每个事务在 AGFS 持久化一份 journal: + +``` +/local/_system/transactions/{tx_id}/journal.json ``` -# 传统串行模式:存在更大的竞态条件窗口 -1. 检查目标目录是否存在 -2. 检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 在目标目录下创建 .path.ovlock 文件,文件内容为事务ID -5. 递归地在目标目录的所有子目录下创建 .path.ovlock 文件 -6. 如果发生加锁失败,移除所有已经创建的 .path.ovlock 文件 -7. 一切正常,则返回加锁成功 -# 自下而上并行模式 -1. 并行遍历整个目录树,收集所有子目录路径 -2. 按照目录层级从深到浅排序,从最深层子目录开始 -3. 以有限并行度(默认最大8)批量创建 .path.ovlock 文件 -4. 最后锁定目标目录 -5. 如果任一位置加锁失败,逆序移除所有已经创建的 .path.ovlock 文件 +内容包含:事务 ID、状态、锁路径、init_info、undo_log、post_actions。 + +### 生命周期 + +``` +创建事务 → 写 journal(INIT) +获取锁 → 更新 journal(AQUIRE → EXEC) +执行变更 → 每步更新 journal(标记 undo entry completed) +提交 → 更新 journal(COMMIT + post_actions) + → 执行 post_actions → 删锁 → 删 journal +回滚 → 执行 undo log → 删锁 → 删 journal ``` -#### mv 操作加锁流程 +## 崩溃恢复 + +`TransactionManager.start()` 启动时自动扫描残留 journal: + +| 崩溃时 journal 状态 | 恢复方式 | +|---------------------|---------| +| `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | +| `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | +| `INIT` / `AQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | + +### 防线总结 + +| 异常场景 | 防线 | 恢复时机 | +|---------|------|---------| +| 事务内崩溃 | journal + undo log 回滚 | 重启时 | +| 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | +| session.commit LLM 调用中崩溃 | checkpoint 文件恢复 | 重启时重新调用 LLM | +| 孤儿索引 | L2 按需加载时清理 | 用户访问时 | 
+| 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | + +## 事务状态机 ``` -1. 先参照 rm 操作对原目录进行加锁 -2. 再参照普通操作过程对新目录进行加锁 +INIT → AQUIRE → EXEC → COMMIT → RELEASING → RELEASED + ↓ + FAIL → RELEASING → RELEASED ``` -### 锁机制性能分析 +- `INIT`:事务已创建,等待锁获取 +- `AQUIRE`:正在获取锁 +- `EXEC`:事务操作执行中 +- `COMMIT`:已提交,可能有 post_actions 待执行 +- `FAIL`:执行失败,进入回滚 +- `RELEASING`:正在释放锁 +- `RELEASED`:锁已释放,事务结束 + +## 配置 + +事务机制默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | +| `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | +| `max_parallel_locks` | int | rm/mv 操作的最大并行加锁数 | `8` | + +### QueueFS 持久化 -- 并行遍历采用广度优先策略,同时处理同一层级的所有目录 -- 并行加锁从最深层开始,逐层向上锁定,确保整个目录树的一致性 -- 有限并行度(默认最大8)避免AGFS服务过载 -- 加锁失败时采用逆序回滚,确保所有已加锁目录都能正确释放 -- 事务状态机明确区分锁管理过程(AQUIRE+RELEASING状态),提高系统可观测性和调试效率 +事务机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 ## 相关文档 - [架构概述](./01-architecture.md) - 系统整体架构 - [存储架构](./05-storage.md) - AGFS 和向量库 -- [会话管理](./08-session.md) - 会话和记忆管理 \ No newline at end of file +- [会话管理](./08-session.md) - 会话和记忆管理 +- [配置](../guides/01-configuration.md) - 配置文件说明 diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 457738cc..ffe71183 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -487,10 +487,9 @@ AST 提取支持:Python、JavaScript/TypeScript、Rust、Go、Java、C/C++。 - #### vectordb -向量库存储的配置 +向量库存储的配置 | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| @@ -612,6 +611,30 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 启动方式和部署详情见 [服务部署](./03-deployment.md),认证详情见 [认证](./04-authentication.md)。 +## storage.transaction 段 + 
+事务机制默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他事务锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取路径锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间后抛出 `LockAcquisitionError` | `0.0` | +| `lock_expire` | float | 锁过期时间(秒)。超过此时间的事务锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | +| `max_parallel_locks` | int | rm/mv 操作递归加锁时的最大并行数 | `8` | + +事务机制的详细说明见 [事务机制](../concepts/09-transaction.md)。 + ## 完整 Schema ```json @@ -646,6 +669,11 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/openviking/agfs_manager.py b/openviking/agfs_manager.py index 14ed124a..9ae796f2 100644 --- a/openviking/agfs_manager.py +++ b/openviking/agfs_manager.py @@ -133,9 +133,23 @@ def _generate_config(self) -> Path: "version": "1.0.0", }, }, + # TODO(multi-node): SQLite backend is single-node only. Each AGFS instance + # gets its own isolated queue.db under its own data_path, so messages + # enqueued on node A are invisible to node B. For multi-node deployments, + # switch backend to "tidb" or "mysql" so all nodes share the same queue. + # + # Additionally, the TiDB backend currently uses immediate soft-delete on + # Dequeue (no two-phase status='processing' transition), meaning there is + # no at-least-once guarantee: a worker crash loses the in-flight message. + # The TiDB backend's Ack() and RecoverStale() are both no-ops and must be + # implemented before it can be used safely in production. 
"queuefs": { "enabled": True, "path": "/queue", + "config": { + "backend": "sqlite", + "db_path": str(self.data_path / "_system" / "queue" / "queue.db"), + }, }, }, } @@ -196,6 +210,7 @@ def start(self) -> None: self._check_port_available() self.vikingfs_path.mkdir(parents=True, exist_ok=True) + (self.data_path / "_system" / "queue").mkdir(parents=True, exist_ok=True) # NOTICE: should use viking://temp/ instead of self.vikingfs_path / "temp" # Create temp directory for Parser use # (self.vikingfs_path / "temp").mkdir(exist_ok=True) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 820ca554..00e1e354 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -163,9 +163,53 @@ async def finalize_from_temp( else: logger.info(f"[TreeBuilder] Finalizing from temp: {final_uri}") - # 4. Move directory tree from temp to final location in AGFS - await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) - logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + # 4. 
Move directory tree from temp to final location in AGFS (transactional) + from openviking.storage.transaction import TransactionContext, get_transaction_manager + + tx_manager = get_transaction_manager() + final_path = viking_fs._uri_to_path(final_uri, ctx=ctx) + # Lock parent directory (final_path doesn't exist yet) + parent_path = final_path.rsplit("/", 1)[0] if "/" in final_path else final_path + + if tx_manager: + # Ensure parent directories exist before locking + await self._ensure_parent_dirs(final_uri, ctx=ctx) + + async with TransactionContext( + tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" + ) as tx: + # Move temp to final + seq = tx.record_undo("fs_write_new", {"uri": final_path}) + await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + tx.mark_completed(seq) + logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + + # Register semantic enqueue as post_action + tx.add_post_action( + "enqueue_semantic", + { + "uri": final_uri, + "context_type": "resource", + "account_id": ctx.account_id, + "user_id": ctx.user.user_id, + "agent_id": ctx.user.agent_id, + "role": ctx.role.value, + }, + ) + + await tx.commit() + else: + # Fallback: no transaction support + await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + + try: + await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) + logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") + except Exception as e: + logger.error( + f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True + ) # 5. Cleanup temporary root directory try: @@ -174,21 +218,13 @@ async def finalize_from_temp( except Exception as e: logger.warning(f"[TreeBuilder] Failed to cleanup temp root: {e}") - # 6. 
Enqueue to SemanticQueue for async semantic generation - if trigger_semantic: - try: - await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) - logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") - except Exception as e: - logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True) - - # 7. Return simple BuildingTree (no scanning needed) + # 6. Return simple BuildingTree (no scanning needed) tree = BuildingTree( source_path=source_path, source_format=source_format, ) tree._root_uri = final_uri - + # Create a minimal Context object for the root so that tree.root is not None root_context = Context(uri=final_uri) tree.add_context(root_context) diff --git a/openviking/service/core.py b/openviking/service/core.py index 7b9a35c0..c1c62c73 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -139,12 +139,20 @@ def _init_storage( vectordb_config=config.vectordb, queue_manager=self._queue_manager ) - # Configure queues if QueueManager is available + # Configure queues if QueueManager is available. + # Workers are NOT started here — start() is called after VikingFS is initialized + # in initialize(), so that recovered tasks don't race against VikingFS init. 
if self._queue_manager: - self._queue_manager.setup_standard_queues(self._vikingdb_manager) + self._queue_manager.setup_standard_queues(self._vikingdb_manager, start=False) # Initialize TransactionManager - self._transaction_manager = init_transaction_manager(agfs_config=config.agfs) + tx_cfg = config.transaction + self._transaction_manager = init_transaction_manager( + agfs_config=config.agfs, + max_parallel_locks=tx_cfg.max_parallel_locks, + lock_timeout=tx_cfg.lock_timeout, + lock_expire=tx_cfg.lock_expire, + ) @property def viking_fs(self) -> Optional[VikingFS]: @@ -240,6 +248,14 @@ async def initialize(self) -> None: if enable_recorder: logger.info("VikingFS IO Recorder enabled") + # Start queue workers now that VikingFS is ready. + # Doing it here (rather than in _init_storage) ensures that any tasks + # recovered from a previous crash are not processed before VikingFS is + # initialized, which would cause "VikingFS not initialized" errors. + if self._queue_manager: + self._queue_manager.start() + logger.info("QueueManager workers started") + # Initialize directories directory_initializer = DirectoryInitializer(vikingdb=self._vikingdb_manager) self._directory_initializer = directory_initializer diff --git a/openviking/session/session.py b/openviking/session/session.py index 62679223..0bbc94aa 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -219,7 +219,14 @@ def update_tool_part( self._update_message_in_jsonl() def commit(self) -> Dict[str, Any]: - """Commit session: create archive, extract memories, persist.""" + """Commit session: two-phase transaction with checkpoint. + + Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. + LLM call (no transaction): Extract long-term memories. + Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. 
+ """ + from openviking.storage.transaction import get_transaction_manager + result = { "session_id": self.session_id, "status": "committed", @@ -231,7 +238,10 @@ def commit(self) -> Dict[str, Any]: if not self._messages: return result - # 1. Archive current messages + tx_manager = get_transaction_manager() + session_path = self._viking_fs._uri_to_path(self._session_uri, ctx=self.ctx) + + # ===== Phase 1: Archive ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -239,22 +249,35 @@ def commit(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - self._write_archive( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) + if tx_manager: + run_async( + self._phase1_archive_async( + tx_manager, + session_path, + self._compression.compression_index, + messages_to_archive, + archive_abstract, + archive_overview, + ) + ) + else: + self._write_archive( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + self._write_to_agfs(messages=[]) self._compression.original_count += len(messages_to_archive) result["archived"] = True - self._messages.clear() logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" + f"Archived: {len(messages_to_archive)} messages → " + f"history/archive_{self._compression.compression_index:03d}/" ) - # 2. Extract long-term memories + # ===== LLM call (no transaction) ===== if self._session_compressor: logger.info( f"Starting memory extraction from {len(messages_to_archive)} archived messages" @@ -271,17 +294,18 @@ def commit(self) -> Dict[str, Any]: result["memories_extracted"] = len(memories) self._stats.memories_extracted += len(memories) - # 3. 
Write current messages to AGFS - self._write_to_agfs(self._messages) + # ===== Phase 2: Memory write ===== + if tx_manager: + run_async(self._phase2_memory_async(tx_manager, session_path)) + else: + self._write_to_agfs(self._messages) + self._write_relations() - # 4. Create relations - self._write_relations() - - # 5. Update active_count + # Update active_count active_count_updated = self._update_active_counts() result["active_count_updated"] = active_count_updated - # 6. Update statistics + # Update statistics self._stats.compression_count = self._compression.compression_index result["stats"] = { "total_turns": self._stats.total_turns, @@ -294,6 +318,58 @@ def commit(self) -> Dict[str, Any]: logger.info(f"Session {self.session_id} committed") return result + async def _phase1_archive_async( + self, + tx_manager: Any, + session_path: str, + compression_index: int, + messages_to_archive: list, + archive_abstract: str, + archive_overview: str, + ) -> None: + """Phase 1 of commit: archive messages inside a transaction.""" + from openviking.storage.transaction import TransactionContext + + async with TransactionContext( + tx_manager, "session_archive", [session_path], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": session_path}) + self._write_archive( + index=compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + self._write_to_agfs(messages=[]) + self._write_checkpoint({"status": "archived", "archive_index": compression_index}) + tx.mark_completed(seq) + await tx.commit() + + async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None: + """Phase 2 of commit: write memories inside a transaction.""" + from openviking.storage.transaction import TransactionContext + + async with TransactionContext( + tx_manager, "session_memory", [session_path], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": session_path}) + 
self._write_to_agfs(self._messages) + self._write_relations() + self._write_checkpoint({"status": "completed"}) + tx.mark_completed(seq) + tx.add_post_action( + "enqueue_semantic", + { + "uri": self._session_uri, + "context_type": "memory", + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + }, + ) + await tx.commit() + def _update_active_counts(self) -> int: """Update active_count for used contexts/skills.""" if not self._vikingdb_manager: @@ -581,6 +657,39 @@ def _write_relations(self) -> None: except Exception as e: logger.warning(f"Failed to create relation to {usage.uri}: {e}") + def _write_checkpoint(self, data: Dict[str, Any]) -> None: + """Write a commit checkpoint file for crash recovery.""" + if not self._viking_fs: + return + + checkpoint = { + **data, + "session_id": self.session_id, + "compression_index": self._compression.compression_index, + "timestamp": get_current_timestamp(), + } + run_async( + self._viking_fs.write_file( + f"{self._session_uri}/.commit_checkpoint.json", + json.dumps(checkpoint, ensure_ascii=False), + ctx=self.ctx, + ) + ) + + def _read_checkpoint(self) -> Optional[Dict[str, Any]]: + """Read commit checkpoint file if it exists.""" + if not self._viking_fs: + return None + try: + content = run_async( + self._viking_fs.read_file( + f"{self._session_uri}/.commit_checkpoint.json", ctx=self.ctx + ) + ) + return json.loads(content) + except Exception: + return None + # ============= Properties ============= @property diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index bc3e36be..7f6a483b 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -29,3 +29,15 @@ class ConnectionError(StorageException): class SchemaError(StorageException): """Raised when schema validation fails.""" + + +class TransactionError(VikingDBException): + """Raised when a transaction operation fails.""" + + +class 
LockAcquisitionError(TransactionError): + """Raised when lock acquisition fails.""" + + +class TransactionRollbackError(TransactionError): + """Raised when transaction rollback fails.""" diff --git a/openviking/storage/queuefs/named_queue.py b/openviking/storage/queuefs/named_queue.py index ca0e9b29..495a284b 100644 --- a/openviking/storage/queuefs/named_queue.py +++ b/openviking/storage/queuefs/named_queue.py @@ -198,6 +198,21 @@ async def enqueue(self, data: Union[str, Dict[str, Any]]) -> str: msg_id = self._agfs.write(enqueue_file, data.encode("utf-8")) return msg_id if isinstance(msg_id, str) else str(msg_id) + async def ack(self, msg_id: str) -> None: + """Acknowledge successful processing of a message (deletes it from persistent storage). + + Must be called after the dequeue handler finishes processing a message. + If not called (e.g. process crashes), the message will be automatically + re-queued on the next startup via RecoverStale. + """ + if not msg_id: + return + ack_file = f"{self.path}/ack" + try: + self._agfs.write(ack_file, msg_id.encode("utf-8")) + except Exception as e: + logger.warning(f"[NamedQueue] Ack failed for {self.name} msg_id={msg_id}: {e}") + def _read_queue_message(self) -> Optional[Dict[str, Any]]: """Read and remove one message from the AGFS queue; return parsed dict or None. @@ -217,15 +232,30 @@ def _read_queue_message(self) -> Optional[Dict[str, Any]]: return json.loads(raw.decode("utf-8")) async def dequeue(self) -> Optional[Dict[str, Any]]: - """Get and remove message from queue, then invoke the dequeue handler.""" + """Dequeue a message, process it, then ack to confirm deletion. + + Flow (at-least-once delivery): + 1. Read from /dequeue → backend marks message as 'processing' (not deleted yet) + 2. Call on_dequeue() → actual processing + 3. 
Call ack() → backend deletes the message permanently + + If the process crashes between steps 1 and 3, the backend's RecoverStale + on the next startup resets the message back to 'pending' for retry. + """ await self._ensure_initialized() try: data = self._read_queue_message() if data is None: return None + # Capture message ID before passing data to handler (handler may modify it) + msg_id = data.get("id", "") if isinstance(data, dict) else "" if self._dequeue_handler: self._on_dequeue_start() data = await self._dequeue_handler.on_dequeue(data) + # Ack unconditionally after handler returns (success or handled error). + # If on_dequeue raises, the exception propagates and ack is skipped — + # the message will be recovered on next startup. + await self.ack(msg_id) return data except Exception as e: logger.debug(f"[NamedQueue] Dequeue failed for {self.name}: {e}") diff --git a/openviking/storage/queuefs/queue_manager.py b/openviking/storage/queuefs/queue_manager.py index 95e9aeb2..b5a68af4 100644 --- a/openviking/storage/queuefs/queue_manager.py +++ b/openviking/storage/queuefs/queue_manager.py @@ -107,16 +107,16 @@ def start(self) -> None: logger.info("[QueueManager] Started") - def setup_standard_queues(self, vector_store: Any) -> None: + def setup_standard_queues(self, vector_store: Any, start: bool = True) -> None: """ Setup standard queues (Embedding and Semantic) with their handlers. - This method initializes the EmbeddingQueue with TextEmbeddingHandler - and the SemanticQueue with SemanticProcessor, then ensures the - queue manager is started. - Args: vector_store: Vector store instance for handlers to write results. + start: Whether to start worker threads immediately (default True). + Pass False when the consumer depends on resources that are + not yet initialized (e.g. VikingFS); call start() manually + after those resources are ready. 
""" # Import handlers here to avoid circular dependencies from openviking.storage.collection_schemas import TextEmbeddingHandler @@ -140,8 +140,8 @@ def setup_standard_queues(self, vector_store: Any) -> None: ) logger.info("Semantic queue initialized with SemanticProcessor") - # Start QueueManager processing - self.start() + if start: + self.start() def _start_queue_worker(self, queue: NamedQueue) -> None: """Start a dedicated worker thread for a queue if not already running.""" @@ -207,10 +207,14 @@ async def _worker_async_concurrent( async def process_one(data: Dict[str, Any]) -> None: async with sem: + msg_id = data.get("id", "") if isinstance(data, dict) else "" try: await queue.process_dequeued(data) + # Ack after successful processing (delete from persistent storage). + await queue.ack(msg_id) except Exception as e: # Handler did not call report_error; decrement in_progress manually. + # Do NOT ack — let RecoverStale re-queue on next startup. queue._on_process_error(str(e), data) logger.error(f"[QueueManager] Concurrent worker error for {queue.name}: {e}") @@ -280,9 +284,6 @@ def get_queue( allow_create: bool = False, ) -> NamedQueue: """Get or create a named queue object.""" - if not self._started: - self.start() - if name not in self._queues: if not allow_create: raise RuntimeError(f"Queue {name} does not exist and allow_create is False") diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 0307521f..0e894474 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -238,6 +238,9 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: return results async def _overview_task(self, dir_uri: str) -> None: + from openviking.storage.errors import LockAcquisitionError + from openviking.storage.transaction import TransactionContext, get_transaction_manager + node = self._nodes.get(dir_uri) if not node: return @@ -246,26 +249,30 @@ async 
def _overview_task(self, dir_uri: str) -> None: file_summaries = self._finalize_file_summaries(node) children_abstracts = self._finalize_children_abstracts(node) - try: - async with self._llm_sem: - overview = await self._processor._generate_overview( - dir_uri, file_summaries, children_abstracts - ) - abstract = self._processor._extract_abstract_from_overview(overview) + abstract = "" + dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) - try: + try: + async with TransactionContext( + get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" + ) as tx: + async with self._llm_sem: + overview = await self._processor._generate_overview( + dir_uri, file_summaries, children_abstracts + ) + abstract = self._processor._extract_abstract_from_overview(overview) await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) - except Exception as e: - logger.warning(f"Failed to write overview/abstract for {dir_uri}: {e}") - - try: - await self._processor._vectorize_directory_simple( - dir_uri, self._context_type, abstract, overview, ctx=self._ctx - ) - except Exception as e: - logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) - + try: + await self._processor._vectorize_directory_simple( + dir_uri, self._context_type, abstract, overview, ctx=self._ctx + ) + except Exception as e: + logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + await tx.commit() + except LockAcquisitionError: + logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") + abstract = "" except Exception as e: logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True) abstract = "" diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 59700783..24e485ed 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ 
b/openviking/storage/queuefs/semantic_processor.py @@ -237,33 +237,56 @@ async def _process_single_directory( file_paths: List[str], ) -> None: """Process single directory, generate .abstract.md and .overview.md.""" + from openviking.storage.errors import LockAcquisitionError + from openviking.storage.transaction import TransactionContext, get_transaction_manager + viking_fs = get_viking_fs() + dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) - # 1. Collect .abstract.md from subdirectories (already processed earlier) - children_abstracts = await self._collect_children_abstracts(children_uris) + try: + async with TransactionContext( + get_transaction_manager(), "semantic", [dir_path], lock_mode="point" + ) as tx: + # 1. Collect .abstract.md from subdirectories + children_abstracts = await self._collect_children_abstracts(children_uris) + + # 2. Generate file summaries (vectorize inline, not via enqueue) + file_summaries = await self._generate_file_summaries( + file_paths, context_type=context_type, parent_uri=uri, enqueue_files=False + ) - # 2. Concurrently generate summaries for files in directory - file_summaries = await self._generate_file_summaries( - file_paths, context_type=context_type, parent_uri=uri, enqueue_files=True - ) + # 3. Generate .overview.md + overview = await self._generate_overview(uri, file_summaries, children_abstracts) - # 3. Generate .overview.md (contains brief description) - overview = await self._generate_overview(uri, file_summaries, children_abstracts) + # 4. Extract abstract from overview + abstract = self._extract_abstract_from_overview(overview) - # 4. Extract abstract from overview - abstract = self._extract_abstract_from_overview(overview) + # 5. Write files + await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) + await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) - # 5. 
Write files - await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) - await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) + logger.debug(f"Generated overview and abstract for {uri}") - logger.debug(f"Generated overview and abstract for {uri}") + # 6. Vectorize directory and files (all inside the lock) + try: + await self._vectorize_directory_simple(uri, context_type, abstract, overview) + except Exception as e: + logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) + + for fp, summary in zip(file_paths, file_summaries): + try: + await self._vectorize_single_file( + parent_uri=uri, + context_type=context_type, + file_path=fp, + summary_dict=summary, + ) + except Exception as e: + logger.error(f"Failed to vectorize file {fp}: {e}", exc_info=True) - # 6. Vectorize directory - try: - await self._vectorize_directory_simple(uri, context_type, abstract, overview) - except Exception as e: - logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) + await tx.commit() + except LockAcquisitionError: + logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") async def _collect_children_abstracts(self, children_uris: List[str]) -> List[Dict[str, str]]: """Collect .abstract.md from subdirectories.""" diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index b6c06d6e..2730cd2e 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -6,6 +6,8 @@ Provides transaction management and lock mechanisms for data operations. 
""" +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.journal import TransactionJournal from openviking.storage.transaction.path_lock import PathLock from openviking.storage.transaction.transaction_manager import ( TransactionManager, @@ -16,12 +18,17 @@ TransactionRecord, TransactionStatus, ) +from openviking.storage.transaction.undo import UndoEntry, execute_rollback __all__ = [ "PathLock", + "TransactionContext", + "TransactionJournal", "TransactionManager", "TransactionRecord", "TransactionStatus", - "init_transaction_manager", + "UndoEntry", + "execute_rollback", "get_transaction_manager", + "init_transaction_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py new file mode 100644 index 00000000..10107dde --- /dev/null +++ b/openviking/storage/transaction/context_manager.py @@ -0,0 +1,146 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Transaction context manager for OpenViking. + +Provides an async context manager that wraps a set of operations in a +transaction with automatic rollback on failure. +""" + +from typing import Any, Dict, List, Optional + +from openviking.storage.errors import LockAcquisitionError, TransactionError +from openviking.storage.transaction.transaction_record import TransactionRecord +from openviking.storage.transaction.undo import UndoEntry +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class TransactionContext: + """Async context manager for transactional operations. + + Usage:: + + async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + seq = tx.record_undo("fs_rm", {"uri": uri}) + # ... do work ... 
+ tx.mark_completed(seq) + await tx.commit() + """ + + def __init__( + self, + tx_manager: Any, + operation: str, + lock_paths: List[str], + lock_mode: str = "point", + mv_dst_path: Optional[str] = None, + ): + self._tx_manager = tx_manager + self._operation = operation + self._lock_paths = lock_paths + self._lock_mode = lock_mode + self._mv_dst_path = mv_dst_path + self._record: Optional[TransactionRecord] = None + self._committed = False + self._sequence = 0 + + @property + def record(self) -> TransactionRecord: + if self._record is None: + raise TransactionError("Transaction not started") + return self._record + + async def __aenter__(self) -> "TransactionContext": + self._record = self._tx_manager.create_transaction( + init_info={ + "operation": self._operation, + "lock_paths": self._lock_paths, + "lock_mode": self._lock_mode, + "mv_dst_path": self._mv_dst_path, + } + ) + tx_id = self._record.id + + # Write journal BEFORE acquiring locks so that crash recovery can + # find orphan locks via init_info even if the process dies between + # lock creation and journal update. 
+ try: + self._tx_manager.journal.write(self._record.to_journal()) + except Exception as e: + logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") + + success = False + if self._lock_mode == "subtree": + for path in self._lock_paths: + success = await self._tx_manager.acquire_lock_subtree(tx_id, path) + if not success: + break + elif self._lock_mode == "mv": + if len(self._lock_paths) < 1 or not self._mv_dst_path: + raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") + success = await self._tx_manager.acquire_lock_mv( + tx_id, self._lock_paths[0], self._mv_dst_path + ) + else: + # "point" mode (default) + for path in self._lock_paths: + success = await self._tx_manager.acquire_lock_point(tx_id, path) + if not success: + break + + if not success: + await self._tx_manager.rollback(tx_id) + raise LockAcquisitionError( + f"Failed to acquire {self._lock_mode} lock for {self._lock_paths}" + ) + + # Update journal with actual lock paths now populated in the record. 
+ try: + self._tx_manager.journal.update(self._record.to_journal()) + except Exception as e: + logger.warning(f"[Transaction] Failed to update journal for {tx_id}: {e}") + + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if not self._committed: + try: + await self._tx_manager.rollback(self._record.id) + except Exception as e: + logger.error(f"Rollback failed during __aexit__: {e}") + return False + + def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: + seq = self._sequence + self._sequence += 1 + entry = UndoEntry(sequence=seq, op_type=op_type, params=params) + self.record.undo_log.append(entry) + + try: + self._tx_manager.journal.update(self.record.to_journal()) + except Exception: + pass + + return seq + + def mark_completed(self, sequence: int) -> None: + for entry in self.record.undo_log: + if entry.sequence == sequence: + entry.completed = True + break + + try: + self._tx_manager.journal.update(self.record.to_journal()) + except Exception: + pass + + def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: + self.record.post_actions.append({"type": action_type, "params": params}) + + async def commit(self) -> None: + self._committed = True + success = await self._tx_manager.commit(self._record.id) + if not success: + raise TransactionError(f"Failed to commit transaction {self._record.id}") diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py new file mode 100644 index 00000000..d641e905 --- /dev/null +++ b/openviking/storage/transaction/journal.py @@ -0,0 +1,114 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Transaction journal for crash recovery. + +Persists transaction state to AGFS so that incomplete transactions can be +detected and recovered after a process restart. 
+""" + +import json +from typing import Any, Dict, List + +from pyagfs import AGFSClient + +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + +# Journal root path (global, not behind VikingFS URI mapping) +_JOURNAL_ROOT = "/local/_system/transactions" + + +class TransactionJournal: + """Persists transaction records to AGFS for crash recovery. + + Journal files live at ``/local/_system/transactions/{tx_id}/journal.json``. + """ + + def __init__(self, agfs: AGFSClient): + self._agfs = agfs + + def _tx_dir(self, tx_id: str) -> str: + return f"{_JOURNAL_ROOT}/{tx_id}" + + def _journal_path(self, tx_id: str) -> str: + return f"{_JOURNAL_ROOT}/{tx_id}/journal.json" + + def _ensure_dir(self, path: str) -> None: + """Create directory, ignoring already-exists errors.""" + try: + self._agfs.mkdir(path) + except Exception as e: + logger.warning(f"[Journal] mkdir {path}: {e}") + + def write(self, data: Dict[str, Any]) -> None: + """Create a new journal entry for a transaction. + + Args: + data: Serialized transaction record (from TransactionRecord.to_journal()). + """ + tx_id = data["id"] + self._ensure_dir("/local/_system") + self._ensure_dir(_JOURNAL_ROOT) + self._ensure_dir(self._tx_dir(tx_id)) + payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") + self._agfs.write(self._journal_path(tx_id), payload) + logger.info(f"[Journal] Written: {self._journal_path(tx_id)}") + + def update(self, data: Dict[str, Any]) -> None: + """Overwrite an existing journal entry. + + Args: + data: Updated serialized transaction record. + """ + tx_id = data["id"] + payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") + self._agfs.write(self._journal_path(tx_id), payload) + + def read(self, tx_id: str) -> Dict[str, Any]: + """Read a journal entry. + + Args: + tx_id: Transaction ID. + + Returns: + Parsed journal data. + + Raises: + FileNotFoundError: If journal does not exist. 
+ """ + content = self._agfs.cat(self._journal_path(tx_id)) + if isinstance(content, bytes): + content = content.decode("utf-8") + return json.loads(content) + + def delete(self, tx_id: str) -> None: + """Delete a transaction's journal directory. + + Args: + tx_id: Transaction ID. + """ + try: + self._agfs.rm(self._tx_dir(tx_id), recursive=True) + logger.debug(f"[Journal] Deleted journal for tx {tx_id}") + except Exception as e: + logger.warning(f"[Journal] Failed to delete journal for tx {tx_id}: {e}") + + def list_all(self) -> List[str]: + """List all transaction IDs that have journal entries. + + Returns: + List of transaction ID strings. + """ + try: + entries = self._agfs.ls(_JOURNAL_ROOT) + tx_ids = [] + if isinstance(entries, list): + for entry in entries: + name = entry.get("name", "") if isinstance(entry, dict) else str(entry) + if name and name not in (".", "..") and entry.get("isDir", True): + tx_ids.append(name) + return tx_ids + except Exception: + return [] diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index c4f959b6..d66bf4c5 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -5,10 +5,33 @@ Provides path-based locking mechanism to prevent concurrent directory operations. Lock protocol: viking://resources/.../.path.ovlock file exists = locked + +Lock files contain a fencing token in the format ``{tx_id}:{time_ns}:{lock_type}`` so that +stale locks (left by crashed processes) can be detected and removed. + +Two lock types: + POINT (P): Locks a specific directory for write/semantic operations. + Blocks if any ancestor holds a SUBTREE lock. + SUBTREE (S): Locks an entire directory subtree for rm/mv-source operations. + Blocks if any descendant holds any lock. + +Livelock prevention: after both parties write their lock files and detect a conflict, +the "later" one (larger (timestamp, tx_id)) backs off and retries. 
+ +# TODO(multi-node): File-based locks only work correctly when all nodes share the +# same AGFS backend with strong read-write consistency. For multi-node deployments +# with replicated or partitioned storage, replace this implementation with a +# distributed lock backend (e.g. etcd txn+lease, ZooKeeper ephemeral nodes). +# The PathLock interface should be extracted to allow swappable backends. +# Key requirements for a distributed backend: +# - Atomic compare-and-set (to avoid write-write races on lock acquisition) +# - Session-bound leases (so crashed nodes auto-release without TTL polling) +# - Monotonically increasing fencing tokens (etcd revision works well) """ import asyncio -from typing import List, Optional +import time +from typing import Optional, Tuple from pyagfs import AGFSClient @@ -20,301 +43,495 @@ # Lock file name LOCK_FILE_NAME = ".path.ovlock" +# Lock type constants +LOCK_TYPE_POINT = "P" +LOCK_TYPE_SUBTREE = "S" + +# Default poll interval when waiting for a lock (seconds) +_POLL_INTERVAL = 0.2 + + +def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: + """Create a fencing token for a transaction. + + Format: ``{tx_id}:{time_ns}:{lock_type}`` where time_ns is the current + wall-clock time in nanoseconds and lock_type is P or S. + + Args: + tx_id: Transaction ID + lock_type: Lock type, either LOCK_TYPE_POINT ("P") or LOCK_TYPE_SUBTREE ("S") + + Returns: + Fencing token string + """ + return f"{tx_id}:{time.time_ns()}:{lock_type}" + + +def _parse_fencing_token(token: str) -> Tuple[str, int, str]: + """Parse a fencing token into (tx_id, timestamp_ns, lock_type). 
+ + Supports: + - New format: ``{tx_id}:{time_ns}:P`` or ``{tx_id}:{time_ns}:S`` + - Legacy format: ``{tx_id}:{time_ns}`` (defaults to POINT) + - Very legacy: plain tx_id (ts=0, defaults to POINT) + + Args: + token: Fencing token string + + Returns: + (tx_id, timestamp_ns, lock_type) — timestamp_ns is 0 for legacy tokens, + lock_type defaults to LOCK_TYPE_POINT for legacy tokens. + """ + # New format ends with ":P" or ":S" + if token.endswith(f":{LOCK_TYPE_POINT}") or token.endswith(f":{LOCK_TYPE_SUBTREE}"): + lock_type = token[-1] + rest = token[:-2] # strip ":{lock_type}" + idx = rest.rfind(":") + if idx >= 0: + tx_id_part = rest[:idx] + ts_part = rest[idx + 1 :] + try: + return tx_id_part, int(ts_part), lock_type + except ValueError: + pass + return rest, 0, lock_type + + # Legacy format: {tx_id}:{time_ns} + if ":" in token: + idx = token.rfind(":") + tx_id_part = token[:idx] + ts_part = token[idx + 1 :] + try: + return tx_id_part, int(ts_part), LOCK_TYPE_POINT + except ValueError: + pass + + return token, 0, LOCK_TYPE_POINT + class PathLock: """Path lock manager for transaction-based directory locking. Implements path-based locking using lock files (.path.ovlock) to prevent concurrent operations on the same directory tree. + + Two lock types: + POINT (P): Used for write and semantic processing operations. + SUBTREE (S): Used for rm and mv-source operations. """ - def __init__(self, agfs_client: AGFSClient): + def __init__(self, agfs_client: AGFSClient, lock_expire: float = 300.0): """Initialize path lock manager. Args: agfs_client: AGFS client for file system operations + lock_expire: Stale lock expiry threshold in seconds (default: 300s). + Locks held longer than this by a crashed process are force-released. """ self._agfs = agfs_client + self._lock_expire = lock_expire def _get_lock_path(self, path: str) -> str: - """Get lock file path for a directory. 
- - Args: - path: Directory path to lock - - Returns: - Lock file path (path/.path.ovlock) - """ - # Remove trailing slash if present + """Get lock file path for a directory.""" path = path.rstrip("/") return f"{path}/{LOCK_FILE_NAME}" def _get_parent_path(self, path: str) -> Optional[str]: - """Get parent directory path. - - Args: - path: Directory path - - Returns: - Parent directory path or None if at root - """ + """Get parent directory path.""" path = path.rstrip("/") if "/" not in path: return None parent = path.rsplit("/", 1)[0] return parent if parent else None - async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: - """Check if path is locked by another transaction. - - Args: - lock_path: Lock file path - transaction_id: Current transaction ID - - Returns: - True if locked by another transaction, False otherwise - """ + def _read_token(self, lock_path: str) -> Optional[str]: + """Read fencing token from lock file, returning None if absent.""" try: content = self._agfs.cat(lock_path) if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() - else: - lock_owner = str(content).strip() - return lock_owner != transaction_id + return content.decode("utf-8").strip() + return str(content).strip() except Exception: - # Lock file doesn't exist or can't be read - not locked + return None + + async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: + """Check if path is locked by another transaction (any lock type).""" + token = self._read_token(lock_path) + if token is None: return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner != transaction_id - async def _create_lock_file(self, lock_path: str, transaction_id: str) -> None: - """Create lock file with transaction ID. 
- - Args: - lock_path: Lock file path - transaction_id: Transaction ID to write to lock file - """ - self._agfs.write(lock_path, transaction_id.encode("utf-8")) + async def _create_lock_file( + self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT + ) -> None: + """Create lock file with fencing token.""" + token = _make_fencing_token(transaction_id, lock_type) + self._agfs.write(lock_path, token.encode("utf-8")) async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: - """Verify lock file is owned by current transaction. + """Verify lock file is owned by current transaction.""" + token = self._read_token(lock_path) + if token is None: + return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner == transaction_id + + async def _remove_lock_file(self, lock_path: str) -> None: + """Remove lock file.""" + try: + self._agfs.rm(lock_path) + except Exception: + pass + + def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: + """Check if a lock file is stale (left by a crashed process). 
+ + A lock is considered stale if: + - The lock file does not exist (already cleaned up) + - The lock file contains a legacy token (no timestamp) + - The lock has been held longer than ``expire_seconds`` Args: lock_path: Lock file path - transaction_id: Current transaction ID + expire_seconds: Lock expiry threshold in seconds (default: 5 minutes) Returns: - True if lock is owned by current transaction, False otherwise + True if the lock is stale, False if it is still fresh """ - try: - content = self._agfs.cat(lock_path) - if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() - else: - lock_owner = str(content).strip() - return lock_owner == transaction_id - except Exception: - return False + token = self._read_token(lock_path) + if token is None: + return True # No file = stale + _, ts, _ = _parse_fencing_token(token) + if ts == 0: + return True # Legacy format = consider stale + age = (time.time_ns() - ts) / 1e9 + return age > expire_seconds + + async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: + """Walk all ancestor directories and return the first SUBTREE lock held by another tx. - async def _remove_lock_file(self, lock_path: str) -> None: - """Remove lock file. 
+ Args: + path: Starting directory path (its ancestors are checked, not itself) + exclude_tx_id: Transaction ID to exclude from conflict detection + + Returns: + Lock file path of the conflicting SUBTREE lock, or None if no conflict + """ + parent = self._get_parent_path(path) + while parent: + lock_path = self._get_lock_path(parent) + token = self._read_token(lock_path) + if token is not None: + owner_id, _, lock_type = _parse_fencing_token(token) + if owner_id != exclude_tx_id and lock_type == LOCK_TYPE_SUBTREE: + return lock_path + parent = self._get_parent_path(parent) + return None + + async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: + """Recursively scan all descendant directories for locks held by another tx. Args: - lock_path: Lock file path + path: Root directory path to scan (its own lock is NOT checked here) + exclude_tx_id: Transaction ID to exclude from conflict detection + + Returns: + Lock file path of the first conflicting lock found, or None if no conflict """ try: - self._agfs.rm(lock_path) - except Exception: - # Lock file might not exist, ignore - pass - - async def acquire_normal(self, path: str, transaction: TransactionRecord) -> bool: - """Acquire path lock for normal operations. 
+ entries = self._agfs.ls(path) + if not isinstance(entries, list): + return None + for entry in entries: + if not isinstance(entry, dict): + continue + name = entry.get("name", "") + if not name or name in (".", ".."): + continue + if not entry.get("isDir", False): + continue + subdir = f"{path.rstrip('/')}/{name}" + subdir_lock = self._get_lock_path(subdir) + token = self._read_token(subdir_lock) + if token is not None: + owner_id, _, _ = _parse_fencing_token(token) + if owner_id != exclude_tx_id: + return subdir_lock + # Recurse into subdir + result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) + if result: + return result + except Exception as e: + logger.warning(f"Failed to scan descendants of {path}: {e}") + return None - Lock acquisition flow for normal operations: - 1. Check if target directory exists - 2. Check if target directory is locked by another transaction - 3. Check if parent directory is locked by another transaction - 4. Create .path.ovlock file with transaction ID - 5. Check again if parent directory is locked by another transaction - 6. Read lock file to confirm it contains current transaction ID - 7. Return success if all checks pass + async def acquire_point( + self, path: str, transaction: TransactionRecord, timeout: float = 0.0 + ) -> bool: + """Acquire POINT lock for write/semantic-processing operations. + + A POINT lock is placed on a single directory. It conflicts with: + - Any lock (P or S) on the same directory by another transaction + - Any SUBTREE (S) lock on any ancestor directory + + Lock acquisition flow: + 1. Check target directory exists + 2. Check if target directory is locked by another transaction → wait/stale-remove + 3. Check if any ancestor holds a SUBTREE lock → wait/stale-remove + 4. Write POINT(P) lock file + 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks + - Conflict found: compare (ts, tx_id); later one backs off and retries + 6. Verify lock ownership + 7. 
Return success Args: path: Directory path to lock transaction: Transaction record + timeout: Maximum time to wait for the lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. Returns: - True if lock acquired successfully, False otherwise + True if lock acquired successfully, False if timeout exceeded """ transaction_id = transaction.id lock_path = self._get_lock_path(path) - parent_path = self._get_parent_path(path) + deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check if target directory exists + # Step 1: Check target directory exists (once, before polling) try: self._agfs.stat(path) except Exception: - logger.warning(f"Directory does not exist: {path}") - return False - - # Step 2: Check if target directory is locked by another transaction - if await self._is_locked_by_other(lock_path, transaction_id): - logger.warning(f"Path already locked by another transaction: {path}") - return False - - # Step 3: Check if parent directory is locked by another transaction - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked by another transaction: {parent_path}") - return False - - # Step 4: Create lock file - try: - await self._create_lock_file(lock_path, transaction_id) - except Exception as e: - logger.error(f"Failed to create lock file: {e}") + logger.warning(f"[POINT] Directory does not exist: {path}") return False - # Step 5: Check again if parent directory is locked - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked after lock creation: {parent_path}") - await self._remove_lock_file(lock_path) + while True: + # Step 2: Check if target directory is locked by another transaction + if await 
self._is_locked_by_other(lock_path, transaction_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[POINT] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[POINT] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 3: Check all ancestors for SUBTREE locks + ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 4: Write POINT lock file + try: + await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) + except Exception as e: + logger.error(f"[POINT] Failed to create lock file: {e}") return False - # Step 6: Verify lock ownership - if not await self._verify_lock_ownership(lock_path, transaction_id): - logger.error(f"Lock ownership verification failed: {path}") - return False - - # Step 7: Success - add lock to transaction - transaction.add_lock(lock_path) - logger.debug(f"Lock acquired: {lock_path}") - return True - - async def _collect_subdirectories(self, path: str) -> List[str]: - """Collect all subdirectory paths recursively. 
- - Args: - path: Root directory path - - Returns: - List of all subdirectory paths - """ - subdirs = [] - try: - entries = self._agfs.ls(path) - if isinstance(entries, list): - for entry in entries: - if isinstance(entry, dict) and entry.get("isDir"): - entry_path = entry.get("name", "") - if entry_path: - subdirs.append(entry_path) - # Recursively collect subdirectories - subdirs.extend(await self._collect_subdirectories(entry_path)) - except Exception as e: - logger.warning(f"Failed to list directory {path}: {e}") - - return subdirs + # Step 5: TOCTOU double-check ancestors for SUBTREE locks + backed_off = False + conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) + ) + # Later one (larger (ts, tx_id)) backs off + if (my_ts, transaction_id) > (their_ts, their_tx_id): + logger.debug(f"[POINT] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + # Either: I backed off, or they will back off. + # In both cases restart the outer loop after a brief wait. 
+ if asyncio.get_event_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 6: Verify lock ownership + if not await self._verify_lock_ownership(lock_path, transaction_id): + logger.debug(f"[POINT] Lock ownership verification failed: {path}") + if asyncio.get_event_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Success + transaction.add_lock(lock_path) + logger.debug(f"[POINT] Lock acquired: {lock_path}") + return True - async def acquire_rm( - self, path: str, transaction: TransactionRecord, max_parallel: int = 8 + async def acquire_subtree( + self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire path lock for rm operation using bottom-up parallel locking. - - Lock acquisition flow for rm operations (parallel bottom-up mode): - 1. Collect all subdirectory paths recursively - 2. Sort by depth (deepest first) - 3. Create lock files in batches with limited parallelism - 4. Lock the target directory last - 5. If any lock fails, release all acquired locks in reverse order + """Acquire SUBTREE lock for rm/mv-source operations. + + A SUBTREE lock is placed on a single directory (the root of the subtree). + It conflicts with: + - Any lock (P or S) on the same directory by another transaction + - Any lock (P or S) on any descendant directory by another transaction + + Lock acquisition flow: + 1. Check target directory exists + 2. Check if target directory is locked by another transaction → wait/stale-remove + 3. Scan all descendants for any locks → wait/stale-remove + 4. Write SUBTREE(S) lock file (only one file, at the root path) + 5. TOCTOU double-check: re-scan descendants for any new locks + - Conflict found: compare (ts, tx_id); later one backs off and retries + 6. Verify lock ownership + 7. 
Return success Args: - path: Directory path to lock + path: Directory path to lock (root of the subtree) transaction: Transaction record - max_parallel: Maximum number of parallel lock operations + timeout: Maximum time to wait for the lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. Returns: - True if all locks acquired successfully, False otherwise + True if lock acquired successfully, False if timeout exceeded """ transaction_id = transaction.id lock_path = self._get_lock_path(path) - acquired_locks = [] + deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Collect all subdirectories - subdirs = await self._collect_subdirectories(path) + # Step 1: Check target directory exists + try: + self._agfs.stat(path) + except Exception: + logger.warning(f"[SUBTREE] Directory does not exist: {path}") + return False - # Step 2: Sort by depth (deepest first) - subdirs.sort(key=lambda p: p.count("/"), reverse=True) + while True: + # Step 2: Check if target directory is locked by another transaction + if await self._is_locked_by_other(lock_path, transaction_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 3: Scan all descendants for any locks by other transactions + desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) + if desc_conflict: + if self.is_lock_stale(desc_conflict, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") + await self._remove_lock_file(desc_conflict) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for descendant lock: 
{desc_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 4: Write SUBTREE lock file (only one file) + try: + await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) + except Exception as e: + logger.error(f"[SUBTREE] Failed to create lock file: {e}") + return False - # Step 3: Create lock files in batches - try: - # Lock subdirectories in batches - for i in range(0, len(subdirs), max_parallel): - batch = subdirs[i : i + max_parallel] - tasks = [] - for subdir in batch: - subdir_lock_path = self._get_lock_path(subdir) - tasks.append(self._create_lock_file(subdir_lock_path, transaction_id)) - - # Execute batch in parallel - await asyncio.gather(*tasks) - acquired_locks.extend([self._get_lock_path(s) for s in batch]) - - # Step 4: Lock target directory - await self._create_lock_file(lock_path, transaction_id) - acquired_locks.append(lock_path) - - # Add all locks to transaction - for lock in acquired_locks: - transaction.add_lock(lock) - - logger.debug(f"RM locks acquired for {len(acquired_locks)} paths") + # Step 5: TOCTOU double-check descendants + backed_off = False + conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) + ) + # Later one (larger (ts, tx_id)) backs off + if (my_ts, transaction_id) > (their_ts, their_tx_id): + logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + # Either: I backed off, or they will back off. + # In both cases restart the outer loop after a brief wait. 
+ if asyncio.get_event_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 6: Verify lock ownership + if not await self._verify_lock_ownership(lock_path, transaction_id): + logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") + if asyncio.get_event_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Success + transaction.add_lock(lock_path) + logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True - except Exception as e: - logger.error(f"Failed to acquire RM locks: {e}") - # Step 5: Release all acquired locks in reverse order - for lock in reversed(acquired_locks): - await self._remove_lock_file(lock) - return False - async def acquire_mv( self, src_path: str, dst_path: str, transaction: TransactionRecord, - max_parallel: int = 8, + timeout: float = 0.0, ) -> bool: """Acquire path lock for mv operation. Lock acquisition flow for mv operations: - 1. Lock source directory (using RM-style locking) - 2. Lock destination directory (using normal locking) + 1. Acquire SUBTREE lock on source directory + 2. Acquire POINT lock on destination parent directory Args: src_path: Source directory path - dst_path: Destination directory path + dst_path: Destination parent directory path transaction: Transaction record - max_parallel: Maximum number of parallel lock operations + timeout: Maximum time to wait for each lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. 
Returns: True if all locks acquired successfully, False otherwise """ - # Step 1: Lock source directory - if not await self.acquire_rm(src_path, transaction, max_parallel): - logger.warning(f"Failed to lock source path: {src_path}") + # Step 1: Lock source directory with SUBTREE lock + if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - # Step 2: Lock destination directory - if not await self.acquire_normal(dst_path, transaction): - logger.warning(f"Failed to lock destination path: {dst_path}") - # Release source locks + # Step 2: Lock destination parent directory with POINT lock + if not await self.acquire_point(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + # Release source lock await self.release(transaction) return False - logger.debug(f"MV locks acquired: {src_path} -> {dst_path}") + logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True async def release(self, transaction: TransactionRecord) -> None: diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 32a24c44..58d9d4df 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -9,7 +9,7 @@ import asyncio import threading import time -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional from pyagfs import AGFSClient @@ -35,6 +35,7 @@ class TransactionManager: - Allocating transaction IDs - Managing transaction lifecycle (start, commit, rollback) - Providing transaction lock mechanism interface, preventing deadlocks + - Persisting transaction state to journal for crash recovery """ def __init__( @@ -42,6 +43,8 @@ def __init__( agfs_client: AGFSClient, timeout: int = 3600, max_parallel_locks: int = 8, + lock_timeout: float 
= 0.0, + lock_expire: float = 300.0, ): """Initialize transaction manager. @@ -49,11 +52,19 @@ def __init__( agfs_client: AGFS client for file system operations timeout: Transaction timeout in seconds (default: 3600) max_parallel_locks: Maximum number of parallel lock operations (default: 8) + lock_timeout: Path lock acquisition timeout in seconds. + 0 (default) = fail immediately if locked. + > 0 = wait/retry up to this many seconds. + lock_expire: Stale lock expiry threshold in seconds (default: 300s). """ + from openviking.storage.transaction.journal import TransactionJournal + self._agfs = agfs_client self._timeout = timeout self._max_parallel_locks = max_parallel_locks - self._path_lock = PathLock(agfs_client) + self._lock_timeout = lock_timeout + self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) + self._journal = TransactionJournal(agfs_client) # Active transactions: {transaction_id: TransactionRecord} self._transactions: Dict[str, TransactionRecord] = {} @@ -66,10 +77,15 @@ def __init__( f"TransactionManager initialized (timeout={timeout}s, max_parallel_locks={max_parallel_locks})" ) + @property + def journal(self): + return self._journal + async def start(self) -> None: """Start transaction manager. - Starts the background cleanup task for timed-out transactions. + Starts the background cleanup task and recovers any pending transactions + left from a previous process crash. """ if self._running: logger.debug("TransactionManager already running") @@ -77,6 +93,12 @@ async def start(self) -> None: self._running = True self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + # Recover any transactions that were interrupted by a previous crash. + # Journal entries are written BEFORE lock acquisition, so every orphan + # lock has a corresponding journal entry that recovery can use to clean it up. 
+ await self._recover_pending_transactions() + logger.info("TransactionManager started") def stop(self) -> None: @@ -125,6 +147,121 @@ async def _cleanup_timed_out(self) -> None: logger.warning(f"Transaction timed out: {tx_id}") await self.rollback(tx_id) + async def _recover_pending_transactions(self) -> None: + """Recover pending transactions from journal after a crash. + + Reads all journal entries and rolls back any transactions that were + not cleanly committed or rolled back. + """ + try: + pending_ids = self._journal.list_all() + except Exception as e: + logger.warning(f"Failed to list journal entries for recovery: {e}") + return + + if not pending_ids: + return + + logger.info(f"Found {len(pending_ids)} pending transaction(s) to recover") + + for tx_id in pending_ids: + try: + await self._recover_one(tx_id) + except Exception as e: + logger.error(f"Failed to recover transaction {tx_id}: {e}") + + async def _recover_one(self, tx_id: str) -> None: + """Recover a single transaction from journal. 
+ + Recovery strategy by status: + COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up + COMMITTED, no post_actions / RELEASED → just clean up + EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up + INIT / ACQUIRE → nothing executed yet, just clean up + """ + from openviking.storage.transaction.undo import execute_rollback + + try: + data = self._journal.read(tx_id) + except Exception as e: + logger.warning(f"Cannot read journal for tx {tx_id}: {e}") + return + + tx = TransactionRecord.from_journal(data) + logger.info(f"Recovering transaction {tx_id} (status={tx.status})") + + if tx.status == TransactionStatus.COMMIT: + # Transaction was committed — replay any unfinished post_actions + if tx.post_actions: + logger.info( + f"Replaying {len(tx.post_actions)} post_action(s) for committed tx {tx_id}" + ) + try: + await self._execute_post_actions(tx.post_actions) + except Exception as e: + logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") + elif tx.status in (TransactionStatus.INIT, TransactionStatus.AQUIRE): + # Transaction never executed any operations — nothing to rollback. + # However, locks may have been created before the journal was updated + # with the actual locks list. Use init_info.lock_paths to find and + # clean up orphan lock files owned by this transaction. + logger.info(f"Transaction {tx_id} never executed, cleaning up orphan locks") + if not tx.locks: + await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) + else: + # EXEC / FAIL / RELEASING: process crashed mid-operation — rollback + # Pass recover_all=True so partial (completed=False) ops are also reversed, + # e.g. a directory mv that started but never finished still leaves residue. 
+ try: + execute_rollback(tx.undo_log, self._agfs, recover_all=True) + except Exception as e: + logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") + + # Release any lock files still present + await self._path_lock.release(tx) + + # Clean up journal + try: + self._journal.delete(tx_id) + except Exception: + pass + + logger.info(f"Recovered transaction {tx_id}") + + async def _cleanup_orphan_locks_from_init_info( + self, tx_id: str, init_info: Dict[str, Any] + ) -> None: + """Clean up orphan lock files using lock path hints from init_info. + + When a crash occurs between lock creation and journal update, the + journal's ``locks`` list is empty but ``init_info.lock_paths`` records + the paths that were intended to be locked. This method checks those + paths and removes any lock files still owned by this transaction. + """ + from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _parse_fencing_token + + lock_paths = init_info.get("lock_paths", []) + lock_mode = init_info.get("lock_mode", "point") + mv_dst_path = init_info.get("mv_dst_path") + + # Collect all candidate paths to check + paths_to_check = list(lock_paths) + if lock_mode == "mv" and mv_dst_path: + paths_to_check.append(mv_dst_path) + + for path in paths_to_check: + lock_file = f"{path.rstrip('/')}/{LOCK_FILE_NAME}" + try: + token = self._path_lock._read_token(lock_file) + if token is None: + continue + owner_id, _, _ = _parse_fencing_token(token) + if owner_id == tx_id: + await self._path_lock._remove_lock_file(lock_file) + logger.info(f"Removed orphan lock for tx {tx_id}: {lock_file}") + except Exception as e: + logger.warning(f"Failed to check orphan lock {lock_file}: {e}") + def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: """Create a new transaction. @@ -171,6 +308,8 @@ async def begin(self, transaction_id: str) -> bool: async def commit(self, transaction_id: str) -> bool: """Commit a transaction. 
+ Executes post-actions, releases all locks, and removes the journal entry. + Args: transaction_id: Transaction ID @@ -185,6 +324,16 @@ async def commit(self, transaction_id: str) -> bool: # Update status to COMMIT tx.update_status(TransactionStatus.COMMIT) + # Persist final committed state before releasing + try: + self._journal.update(tx.to_journal()) + except Exception: + pass + + # Execute post-actions (best-effort, errors are logged but don't fail commit) + if tx.post_actions: + await self._execute_post_actions(tx.post_actions) + # Release all locks tx.update_status(TransactionStatus.RELEASING) await self._path_lock.release(tx) @@ -195,18 +344,29 @@ async def commit(self, transaction_id: str) -> bool: # Remove from active transactions self._transactions.pop(transaction_id, None) + # Clean up journal entry (last step — lock is already released) + try: + self._journal.delete(transaction_id) + except Exception as e: + logger.warning(f"Failed to delete journal on commit for {transaction_id}: {e}") + logger.debug(f"Transaction committed: {transaction_id}") return True async def rollback(self, transaction_id: str) -> bool: """Rollback a transaction. + Executes undo log entries in reverse order, releases all locks, + and removes the journal entry. 
+ Args: transaction_id: Transaction ID Returns: True if transaction rolled back successfully, False otherwise """ + from openviking.storage.transaction.undo import execute_rollback + tx = self.get_transaction(transaction_id) if not tx: logger.error(f"Transaction not found: {transaction_id}") @@ -215,6 +375,21 @@ async def rollback(self, transaction_id: str) -> bool: # Update status to FAIL tx.update_status(TransactionStatus.FAIL) + # Persist rollback state + try: + self._journal.update(tx.to_journal()) + except Exception: + pass + + # Execute undo log (best-effort) + if tx.undo_log: + try: + execute_rollback(tx.undo_log, self._agfs) + except Exception as e: + logger.warning( + f"Undo log execution failed during rollback of {transaction_id}: {e}" + ) + # Release all locks tx.update_status(TransactionStatus.RELEASING) await self._path_lock.release(tx) @@ -225,11 +400,57 @@ async def rollback(self, transaction_id: str) -> bool: # Remove from active transactions self._transactions.pop(transaction_id, None) + # Clean up journal entry (last step — lock is already released) + try: + self._journal.delete(transaction_id) + except Exception as e: + logger.warning(f"Failed to delete journal on rollback for {transaction_id}: {e}") + logger.debug(f"Transaction rolled back: {transaction_id}") return True - async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: - """Acquire path lock for normal (non-rm/mv) operations. + async def _execute_post_actions(self, post_actions: List[Dict[str, Any]]) -> None: + """Execute post-commit actions. + + Post-actions are executed after a successful commit. Errors are logged + but do not affect the commit outcome. 
+ + Args: + post_actions: List of post-action dicts with 'type' and 'params' keys + """ + for action in post_actions: + action_type = action.get("type", "") + params = action.get("params", {}) + try: + if action_type == "enqueue_semantic": + await self._post_enqueue_semantic(params) + else: + logger.warning(f"Unknown post-action type: {action_type}") + except Exception as e: + logger.warning(f"Post-action '{action_type}' failed: {e}") + + async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: + """Execute enqueue_semantic post-action.""" + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + + queue_manager = get_queue_manager() + if queue_manager is None: + logger.debug("No queue manager available, skipping enqueue_semantic post-action") + return + + uri = params.get("uri") + context_type = params.get("context_type", "resource") + account_id = params.get("account_id", "default") + if not uri: + return + + msg = SemanticMsg(uri=uri, context_type=context_type, account_id=account_id) + semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) + await semantic_queue.enqueue(msg) + + async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: + """Acquire POINT lock for write/semantic-processing operations. 
Args: transaction_id: Transaction ID @@ -244,7 +465,7 @@ async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: return False tx.update_status(TransactionStatus.AQUIRE) - success = await self._path_lock.acquire_normal(path, tx) + success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) if success: tx.update_status(TransactionStatus.EXEC) @@ -253,15 +474,15 @@ async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: return success - async def acquire_lock_rm( - self, transaction_id: str, path: str, max_parallel: Optional[int] = None + async def acquire_lock_subtree( + self, transaction_id: str, path: str, timeout: Optional[float] = None ) -> bool: - """Acquire path lock for rm operation. + """Acquire SUBTREE lock for rm/mv-source operations. Args: transaction_id: Transaction ID - path: Directory path to lock - max_parallel: Maximum number of parallel lock operations (default: from config) + path: Directory path to lock (root of the subtree) + timeout: Maximum time to wait for the lock in seconds (default: from config) Returns: True if lock acquired successfully, False otherwise @@ -272,8 +493,8 @@ async def acquire_lock_rm( return False tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_rm(path, tx, parallel) + effective_timeout = timeout if timeout is not None else self._lock_timeout + success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) if success: tx.update_status(TransactionStatus.EXEC) @@ -287,15 +508,15 @@ async def acquire_lock_mv( transaction_id: str, src_path: str, dst_path: str, - max_parallel: Optional[int] = None, + timeout: Optional[float] = None, ) -> bool: """Acquire path lock for mv operation. 
Args: transaction_id: Transaction ID src_path: Source directory path - dst_path: Destination directory path - max_parallel: Maximum number of parallel lock operations (default: from config) + dst_path: Destination parent directory path + timeout: Maximum time to wait for each lock in seconds (default: from config) Returns: True if lock acquired successfully, False otherwise @@ -306,8 +527,10 @@ async def acquire_lock_mv( return False tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_mv(src_path, dst_path, tx, parallel) + effective_timeout = timeout if timeout is not None else self._lock_timeout + success = await self._path_lock.acquire_mv( + src_path, dst_path, tx, timeout=effective_timeout + ) if success: tx.update_status(TransactionStatus.EXEC) @@ -337,6 +560,8 @@ def init_transaction_manager( agfs_config: Any, tx_timeout: int = 3600, max_parallel_locks: int = 8, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, ) -> TransactionManager: """Initialize transaction manager singleton. @@ -344,6 +569,10 @@ def init_transaction_manager( agfs_config: AGFS configuration (url, timeout, etc.) tx_timeout: Transaction timeout in seconds (default: 3600) max_parallel_locks: Maximum number of parallel lock operations (default: 8) + lock_timeout: Path lock acquisition timeout in seconds. + 0 (default) = fail immediately if locked. + > 0 = wait/retry up to this many seconds. + lock_expire: Stale lock expiry threshold in seconds (default: 300s). 
Returns: TransactionManager instance @@ -367,6 +596,8 @@ def init_transaction_manager( agfs_client=agfs_client, timeout=tx_timeout, max_parallel_locks=max_parallel_locks, + lock_timeout=lock_timeout, + lock_expire=lock_expire, ) logger.info("TransactionManager initialized as singleton") diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py index fba6480b..c73775de 100644 --- a/openviking/storage/transaction/transaction_record.py +++ b/openviking/storage/transaction/transaction_record.py @@ -41,6 +41,8 @@ class TransactionRecord: status: Current transaction status init_info: Transaction initialization information rollback_info: Information for rollback operations + undo_log: List of undo entries for rollback + post_actions: Actions to execute after successful commit created_at: Creation timestamp (Unix timestamp in seconds) updated_at: Last update timestamp (Unix timestamp in seconds) """ @@ -50,44 +52,30 @@ class TransactionRecord: status: TransactionStatus = field(default=TransactionStatus.INIT) init_info: Dict[str, Any] = field(default_factory=dict) rollback_info: Dict[str, Any] = field(default_factory=dict) + undo_log: List[Any] = field(default_factory=list) + post_actions: List[Dict[str, Any]] = field(default_factory=list) created_at: float = field(default_factory=time.time) updated_at: float = field(default_factory=time.time) def update_status(self, status: TransactionStatus) -> None: - """Update transaction status and timestamp. - - Args: - status: New transaction statusudi - """ + """Update transaction status and timestamp.""" self.status = status self.updated_at = time.time() def add_lock(self, lock_path: str) -> None: - """Add a lock to the transaction. 
- - Args: - lock_path: Path to be locked - """ + """Add a lock to the transaction.""" if lock_path not in self.locks: self.locks.append(lock_path) self.updated_at = time.time() def remove_lock(self, lock_path: str) -> None: - """Remove a lock from the transaction. - - Args: - lock_path: Path to be unlocked - """ + """Remove a lock from the transaction.""" if lock_path in self.locks: self.locks.remove(lock_path) self.updated_at = time.time() def to_dict(self) -> Dict[str, Any]: - """Convert transaction record to dictionary. - - Returns: - Dictionary representation of the transaction record - """ + """Convert transaction record to dictionary.""" return { "id": self.id, "locks": self.locks, @@ -98,16 +86,45 @@ def to_dict(self) -> Dict[str, Any]: "updated_at": self.updated_at, } + def to_journal(self) -> Dict[str, Any]: + """Serialize to journal format (includes undo_log and post_actions).""" + from openviking.storage.transaction.undo import UndoEntry + + return { + "id": self.id, + "locks": self.locks, + "status": str(self.status), + "init_info": self.init_info, + "undo_log": [e.to_dict() if isinstance(e, UndoEntry) else e for e in self.undo_log], + "post_actions": self.post_actions, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Create transaction record from dictionary. 
+ def from_journal(cls, data: Dict[str, Any]) -> "TransactionRecord": + """Restore from journal format.""" + from openviking.storage.transaction.undo import UndoEntry + + status_str = data.get("status", "INIT") + status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str + undo_log = [UndoEntry.from_dict(e) for e in data.get("undo_log", [])] - Args: - data: Dictionary representation of the transaction record + return cls( + id=data.get("id", str(uuid.uuid4())), + locks=data.get("locks", []), + status=status, + init_info=data.get("init_info", {}), + rollback_info={}, + undo_log=undo_log, + post_actions=data.get("post_actions", []), + created_at=data.get("created_at", time.time()), + updated_at=data.get("updated_at", time.time()), + ) - Returns: - TransactionRecord instance - """ + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": + """Create transaction record from dictionary.""" status_str = data.get("status", "INIT") status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py new file mode 100644 index 00000000..d64d1619 --- /dev/null +++ b/openviking/storage/transaction/undo.py @@ -0,0 +1,147 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Undo log and rollback executor for transaction management. + +Records operations performed within a transaction so they can be reversed +on rollback. Each UndoEntry captures one atomic sub-operation. +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class UndoEntry: + """A single undo log entry representing one reversible sub-operation. + + Attributes: + sequence: Monotonically increasing index within the transaction. 
+ op_type: Operation type (fs_mv, fs_rm, fs_mkdir, fs_write_new, + vectordb_upsert, vectordb_delete, vectordb_update_uri). + params: Parameters needed to reverse the operation. + completed: Whether the forward operation completed successfully. + """ + + sequence: int + op_type: str + params: Dict[str, Any] = field(default_factory=dict) + completed: bool = False + + def to_dict(self) -> Dict[str, Any]: + return { + "sequence": self.sequence, + "op_type": self.op_type, + "params": self.params, + "completed": self.completed, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": + return cls( + sequence=data.get("sequence", 0), + op_type=data.get("op_type", ""), + params=data.get("params", {}), + completed=data.get("completed", False), + ) + + +def execute_rollback( + undo_log: List[UndoEntry], + agfs: Any, + vector_store: Optional[Any] = None, + ctx: Optional[Any] = None, + recover_all: bool = False, +) -> None: + """Execute rollback by reversing operations in reverse order. + + Best-effort: each step is wrapped in try-except so a single failure + does not prevent subsequent undo steps from running. + + Args: + undo_log: List of undo entries to process. + agfs: AGFS client for filesystem operations. + vector_store: Optional vector store client. + ctx: Optional request context. + recover_all: If True, also attempt to reverse entries that were not + marked completed (used during crash recovery to clean up partial + operations such as a directory mv that only half-finished). 
+ """ + if recover_all: + entries = list(undo_log) + else: + entries = [e for e in undo_log if e.completed] + entries.sort(key=lambda e: e.sequence, reverse=True) + + for entry in entries: + try: + _rollback_entry(entry, agfs, vector_store, ctx) + logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") + except Exception as e: + logger.warning( + f"[Rollback] Failed to reverse {entry.op_type} seq={entry.sequence}: {e}" + ) + + +def _rollback_entry( + entry: UndoEntry, + agfs: Any, + vector_store: Optional[Any], + ctx: Optional[Any], +) -> None: + """Dispatch rollback for a single undo entry.""" + from openviking_cli.utils import run_async + + op = entry.op_type + params = entry.params + + if op == "fs_mv": + agfs.mv(params["dst"], params["src"]) + + elif op == "fs_rm": + logger.debug("[Rollback] fs_rm is not reversible, skipping") + + elif op == "fs_mkdir": + try: + agfs.rm(params["uri"]) + except Exception: + pass + + elif op == "fs_write_new": + try: + agfs.rm(params["uri"], recursive=True) + except Exception: + pass + + elif op == "vectordb_upsert": + if vector_store: + record_id = params.get("record_id") + if record_id: + run_async(vector_store.delete([record_id])) + + elif op == "vectordb_delete": + if vector_store and ctx: + records_snapshot = params.get("records_snapshot", []) + for record in records_snapshot: + try: + run_async(vector_store.upsert(record)) + except Exception as e: + logger.warning(f"[Rollback] Failed to restore vector record: {e}") + + elif op == "vectordb_update_uri": + if vector_store and ctx: + run_async( + vector_store.update_uri_mapping( + ctx=ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), + ) + ) + + else: + logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 5b5afbb6..200caef0 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -260,15 
+260,61 @@ async def rm( This method is idempotent: deleting a non-existent file succeeds after cleaning up any orphan index records. + + Wrapped in a transaction: deletes VectorDB records first, then FS files. + On rollback, VectorDB records are restored from snapshot. """ + from openviking.storage.transaction import TransactionContext, get_transaction_manager + self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) uris_to_delete.append(target_uri) - result = self.agfs.rm(path, recursive=recursive) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - return result + + tx_manager = get_transaction_manager() + if not tx_manager: + # Fallback: no transaction support + result = self.agfs.rm(path, recursive=recursive) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + return result + + # Check existence and determine lock strategy + try: + stat = self.agfs.stat(path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False + except Exception: + # Path does not exist: clean up any orphan index records and return + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + logger.info(f"[VikingFS] rm target not found, cleaned orphan index: {uri}") + return {} + + if is_dir: + lock_paths = [path] + lock_mode = "subtree" + else: + parent = path.rsplit("/", 1)[0] if "/" in path else path + lock_paths = [parent] + lock_mode = "point" + + async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: + # Snapshot vector records for rollback + records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) + + # Step 1: Delete from VectorDB first + seq_vdb = tx.record_undo( + "vectordb_delete", {"uris": uris_to_delete, "records_snapshot": records_snapshot} + ) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + tx.mark_completed(seq_vdb) + + # Step 
2: Delete from FS + seq_fs = tx.record_undo("fs_rm", {"uri": path, "recursive": recursive}) + result = self.agfs.rm(path, recursive=recursive) + tx.mark_completed(seq_fs) + + await tx.commit() + return result async def mv( self, @@ -276,7 +322,13 @@ async def mv( new_uri: str, ctx: Optional[RequestContext] = None, ) -> Dict[str, Any]: - """Move file/directory + recursively update vector index.""" + """Move file/directory + recursively update vector index. + + Wrapped in a transaction: performs FS mv first, then VectorDB URI update. + On rollback, the file is moved back and VectorDB mappings are restored. + """ + from openviking.storage.transaction import TransactionContext, get_transaction_manager + self._ensure_access(old_uri, ctx) self._ensure_access(new_uri, ctx) old_path = self._uri_to_path(old_uri, ctx=ctx) @@ -285,15 +337,61 @@ async def mv( uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) uris_to_move.append(target_uri) + tx_manager = get_transaction_manager() + if not tx_manager: + # Fallback: no transaction support + try: + result = self.agfs.mv(old_path, new_path) + await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + return result + except AGFSHTTPError as e: + if e.status_code == 404: + await self._delete_from_vector_store(uris_to_move, ctx=ctx) + logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") + raise + + # Verify source exists before locking try: - result = self.agfs.mv(old_path, new_path) + self.agfs.stat(old_path) + except Exception: + raise FileNotFoundError(f"mv source not found: {old_uri}") + + # Lock source and destination's parent (dst doesn't exist yet) + dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path + + async with TransactionContext( + tx_manager, "mv", [old_path], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + # Step 1: FS move + seq_mv = tx.record_undo("fs_mv", {"src": old_path, "dst": new_path}) + try: + result = 
self.agfs.mv(old_path, new_path) + except AGFSHTTPError as e: + if e.status_code == 404: + await self._delete_from_vector_store(uris_to_move, ctx=ctx) + logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") + raise + tx.mark_completed(seq_mv) + + # Step 2: Update VectorDB URIs + old_uri_stripped = old_uri.rstrip("/") + old_parent_uri = ( + old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" + ) + seq_vdb = tx.record_undo( + "vectordb_update_uri", + { + "old_uri": old_uri, + "new_uri": new_uri, + "old_parent_uri": old_parent_uri, + "uris": uris_to_move, + }, + ) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + tx.mark_completed(seq_vdb) + + await tx.commit() return result - except AGFSHTTPError as e: - if e.status_code == 404: - await self._delete_from_vector_store(uris_to_move, ctx=ctx) - logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") - raise async def grep( self, @@ -1040,6 +1138,33 @@ def _infer_context_type(self, uri: str): # ========== Vector Sync Helper Methods ========== + async def _snapshot_vector_records( + self, uris: List[str], ctx: Optional[RequestContext] = None + ) -> List[Dict[str, Any]]: + """Snapshot vector records for the given URIs (for rollback). + + Queries VectorDB metadata (without embedding vectors) so that + records can be restored during rollback. 
+ """ + vector_store = self._get_vector_store() + if not vector_store: + return [] + + real_ctx = self._ctx_or_default(ctx) + snapshots = [] + for uri in uris: + try: + records = await vector_store.get_context_by_uri( + account_id=real_ctx.account_id, + uri=uri, + limit=10, + ) + if records: + snapshots.extend(records) + except Exception as e: + logger.debug(f"[VikingFS] Failed to snapshot vector record for {uri}: {e}") + return snapshots + async def _collect_uris( self, path: str, recursive: bool, ctx: Optional[RequestContext] = None ) -> List[str]: diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index c0627467..d6d043d9 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -442,14 +442,22 @@ async def update_uri_mapping( new_uri: str, new_parent_uri: str, ) -> bool: + # A directory URI may have multiple records (e.g. L0 abstract + L1 overview), + # so fetch and update all of them. 
records = await self.filter( filter=And([Eq("uri", uri), Eq("account_id", ctx.account_id)]), - limit=1, + limit=100, ) - if not records or "id" not in records[0]: + if not records: return False - updated = {**records[0], "uri": new_uri, "parent_uri": new_parent_uri} - return bool(await self.upsert(updated)) + success = False + for record in records: + if "id" not in record: + continue + updated = {**record, "uri": new_uri, "parent_uri": new_parent_uri} + if await self.upsert(updated): + success = True + return success async def increment_active_count(self, ctx: RequestContext, uris: List[str]) -> int: updated = 0 diff --git a/openviking/utils/agfs_utils.py b/openviking/utils/agfs_utils.py index 9b3d2d57..2c4f5b3c 100644 --- a/openviking/utils/agfs_utils.py +++ b/openviking/utils/agfs_utils.py @@ -99,6 +99,10 @@ def mount_agfs_backend(agfs: Any, agfs_config: Any) -> None: local_dir = plugin_config["config"]["local_dir"] os.makedirs(local_dir, exist_ok=True) logger.debug(f"[AGFSUtils] Ensured local directory exists: {local_dir}") + # Ensure queuefs db_path parent directory exists before mounting + if plugin_name == "queuefs" and "db_path" in plugin_config.get("config", {}): + db_path = plugin_config["config"]["db_path"] + os.makedirs(os.path.dirname(db_path), exist_ok=True) try: agfs.unmount(mount_path) diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py index 8daf6a79..b8b4bfea 100644 --- a/openviking_cli/utils/config/storage_config.py +++ b/openviking_cli/utils/config/storage_config.py @@ -8,6 +8,7 @@ from openviking_cli.utils.logger import get_logger from .agfs_config import AGFSConfig +from .transaction_config import TransactionConfig from .vectordb_config import VectorDBBackendConfig logger = get_logger(__name__) @@ -25,6 +26,11 @@ class StorageConfig(BaseModel): agfs: AGFSConfig = Field(default_factory=lambda: AGFSConfig(), description="AGFS configuration") + transaction: TransactionConfig = Field( + 
default_factory=lambda: TransactionConfig(), + description="Transaction mechanism configuration", + ) + vectordb: VectorDBBackendConfig = Field( default_factory=lambda: VectorDBBackendConfig(), description="VectorDB backend configuration", diff --git a/openviking_cli/utils/config/transaction_config.py b/openviking_cli/utils/config/transaction_config.py new file mode 100644 index 00000000..fac8c2aa --- /dev/null +++ b/openviking_cli/utils/config/transaction_config.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel, Field + + +class TransactionConfig(BaseModel): + """Configuration for the transaction mechanism. + + By default, lock acquisition does not wait (``lock_timeout=0``): if a + conflicting lock is held the operation fails immediately with + ``LockAcquisitionError``. Set ``lock_timeout`` to a positive value to + allow the caller to block and retry for up to that many seconds. + """ + + lock_timeout: float = Field( + default=0.0, + description=( + "Path lock acquisition timeout (seconds). " + "0 = fail immediately if locked (default). " + "> 0 = wait/retry up to this many seconds before raising LockAcquisitionError." + ), + ) + + lock_expire: float = Field( + default=300.0, + description=( + "Stale lock expiry threshold (seconds). " + "Locks held longer than this by a crashed process are force-released." + ), + ) + + max_parallel_locks: int = Field( + default=8, + description="Maximum parallel lock operations during recursive rm/mv.", + ) + + model_config = {"extra": "forbid"} diff --git a/tests/transaction/__init__.py b/tests/transaction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py new file mode 100644 index 00000000..db77bbdd --- /dev/null +++ b/tests/transaction/conftest.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Shared fixtures for transaction tests using real AGFS backend.""" + +import os +import shutil +import uuid + +import pytest + +from openviking.agfs_manager import AGFSManager +from openviking.utils.agfs_utils import create_agfs_client +from openviking_cli.utils.config.agfs_config import AGFSConfig + +AGFS_CONF = AGFSConfig( + path="/tmp/ov-tx-test", backend="local", port=1834, url="http://localhost:1834", timeout=10 +) + +# Clean slate before session starts +if os.path.exists(AGFS_CONF.path): + shutil.rmtree(AGFS_CONF.path) + + +@pytest.fixture(scope="session") +def agfs_manager(): + manager = AGFSManager(config=AGFS_CONF) + manager.start() + yield manager + manager.stop() + + +@pytest.fixture(scope="session") +def agfs_client(agfs_manager): + return create_agfs_client(AGFS_CONF) + + +def _mkdir_ok(agfs_client, path): + """Create directory, ignoring already-exists errors.""" + try: + agfs_client.mkdir(path) + except Exception: + pass # already exists + + +@pytest.fixture +def test_dir(agfs_client): + """每个测试独享隔离目录,自动清理。""" + path = f"/local/tx-tests/{uuid.uuid4().hex}" + _mkdir_ok(agfs_client, "/local") + _mkdir_ok(agfs_client, "/local/tx-tests") + _mkdir_ok(agfs_client, path) + yield path + try: + agfs_client.rm(path, recursive=True) + except Exception: + pass diff --git a/tests/transaction/test_concurrent_lock.py b/tests/transaction/test_concurrent_lock.py new file mode 100644 index 00000000..e98279e4 --- /dev/null +++ b/tests/transaction/test_concurrent_lock.py @@ -0,0 +1,103 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for concurrent lock acquisition using real AGFS backend.""" + +import asyncio +import uuid + +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.transaction_record import TransactionRecord + + +class TestConcurrentLock: + async def test_point_mutual_exclusion_same_path(self, agfs_client, test_dir): + """两个任务竞争同一路径的 POINT 锁,均最终成功(串行执行)。""" + lock = PathLock(agfs_client) + + results = {} + + async def holder(tx_id): + tx = TransactionRecord(id=tx_id) + ok = await lock.acquire_point(test_dir, tx, timeout=5.0) + if ok: + await asyncio.sleep(0.3) + await lock.release(tx) + results[tx_id] = ok + + await asyncio.gather( + holder("tx-conc-1"), + holder("tx-conc-2"), + ) + + # Both should eventually succeed (one waits for the other) + assert results["tx-conc-1"] is True + assert results["tx-conc-2"] is True + + async def test_subtree_blocks_concurrent_point_child(self, agfs_client, test_dir): + """SUBTREE on parent 持锁期间,子目录的 POINT 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + parent_acquired = asyncio.Event() + parent_released = asyncio.Event() + + child_result = {} + + async def parent_holder(): + tx = TransactionRecord(id="tx-sub-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + assert ok is True + parent_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + parent_released.set() + + async def child_worker(): + await parent_acquired.wait() + tx = TransactionRecord(id="tx-sub-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + child_result["ok"] = ok + child_result["after_release"] = parent_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(parent_holder(), child_worker()) + + assert child_result["ok"] is True + # Child should succeed only after parent released + assert child_result["after_release"] is True + + async def 
test_point_child_blocks_concurrent_subtree_parent(self, agfs_client, test_dir): + """POINT on child 持锁期间,父目录的 SUBTREE 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + child_acquired = asyncio.Event() + child_released = asyncio.Event() + + parent_result = {} + + async def child_holder(): + tx = TransactionRecord(id="tx-rev-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + assert ok is True + child_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + child_released.set() + + async def parent_worker(): + await child_acquired.wait() + tx = TransactionRecord(id="tx-rev-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + parent_result["ok"] = ok + parent_result["after_release"] = child_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(child_holder(), parent_worker()) + + assert parent_result["ok"] is True + assert parent_result["after_release"] is True diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py new file mode 100644 index 00000000..f45a55cc --- /dev/null +++ b/tests/transaction/test_context_manager.py @@ -0,0 +1,224 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for TransactionContext.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus + + +def _make_tx_manager(lock_succeeds=True): + """Create a mock TransactionManager with async methods.""" + tx_manager = MagicMock() + record = TransactionRecord(id="tx-test", status=TransactionStatus.INIT) + + tx_manager.create_transaction.return_value = record + tx_manager.acquire_lock_point = AsyncMock(return_value=lock_succeeds) + tx_manager.acquire_lock_subtree = AsyncMock(return_value=lock_succeeds) + tx_manager.acquire_lock_mv = AsyncMock(return_value=lock_succeeds) + tx_manager.commit = AsyncMock(return_value=True) + tx_manager.rollback = AsyncMock(return_value=True) + + journal = MagicMock() + tx_manager.journal = journal + + return tx_manager, record + + +class TestTransactionContextNormal: + async def test_commit_success(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) + tx.mark_completed(seq) + await tx.commit() + + tx_manager.commit.assert_called_once_with("tx-test") + tx_manager.rollback.assert_not_called() + + async def test_rollback_on_exception(self): + tx_manager, record = _make_tx_manager() + + with pytest.raises(ValueError): + async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) + tx.mark_completed(seq) + raise ValueError("something went wrong") + + tx_manager.rollback.assert_called_once_with("tx-test") + tx_manager.commit.assert_not_called() + + async def test_rollback_on_no_commit(self): + tx_manager, record = _make_tx_manager() + + async with 
TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + tx.record_undo("fs_write_new", {"uri": "/path/file"}) + # Forgot to call tx.commit() + + tx_manager.rollback.assert_called_once_with("tx-test") + + async def test_lock_failure_raises(self): + tx_manager, record = _make_tx_manager(lock_succeeds=False) + + with pytest.raises(LockAcquisitionError): + async with TransactionContext(tx_manager, "test_op", ["/path"]) as _tx: + pass + + +class TestTransactionContextLockModes: + async def test_subtree_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "rm_op", ["/path"], lock_mode="subtree") as tx: + await tx.commit() + + tx_manager.acquire_lock_subtree.assert_called_once() + + async def test_mv_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext( + tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path="/dst" + ) as tx: + await tx.commit() + + tx_manager.acquire_lock_mv.assert_called_once_with("tx-test", "/src", "/dst") + + async def test_point_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "write_op", ["/path"], lock_mode="point") as tx: + await tx.commit() + + tx_manager.acquire_lock_point.assert_called_once() + + +class TestTransactionContextUndoLog: + async def test_undo_entries_tracked(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) + s1 = tx.record_undo("fs_write_new", {"uri": "/a/f.txt"}) + tx.mark_completed(s0) + tx.mark_completed(s1) + await tx.commit() + + assert len(record.undo_log) == 2 + assert record.undo_log[0].completed is True + assert record.undo_log[1].completed is True + + +class TestTransactionContextPostActions: + async def test_post_actions_added(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: 
+ tx.add_post_action("enqueue_semantic", {"uri": "viking://test"}) + await tx.commit() + + assert len(record.post_actions) == 1 + assert record.post_actions[0]["type"] == "enqueue_semantic" + + +class TestTransactionContextEdgeCases: + async def test_commit_failure_raises_transaction_error(self): + """When TransactionManager.commit() returns False, TransactionError is raised.""" + from openviking.storage.errors import TransactionError + + tx_manager, record = _make_tx_manager() + tx_manager.commit = AsyncMock(return_value=False) + + with pytest.raises(TransactionError, match="Failed to commit"): + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + await tx.commit() + + async def test_mv_mode_missing_dst_raises(self): + """mv lock mode without mv_dst_path raises TransactionError.""" + from openviking.storage.errors import TransactionError + + tx_manager, record = _make_tx_manager() + + with pytest.raises(TransactionError, match="mv lock mode requires"): + async with TransactionContext( + tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path=None + ) as _tx: + pass + + async def test_mark_completed_nonexistent_sequence_is_noop(self): + """mark_completed with a sequence not in undo_log doesn't crash.""" + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) + tx.mark_completed(999) # Nonexistent sequence + # Original entry should remain unmarked + assert record.undo_log[0].completed is False + tx.mark_completed(seq) + assert record.undo_log[0].completed is True + await tx.commit() + + async def test_journal_update_failure_does_not_break_transaction(self): + """Journal update failures during record_undo/mark_completed are silently ignored.""" + tx_manager, record = _make_tx_manager() + tx_manager.journal.update.side_effect = Exception("disk full") + + # Should not raise despite journal failures + async with TransactionContext(tx_manager, 
"test", ["/path"]) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) + tx.mark_completed(seq) + await tx.commit() + + assert len(record.undo_log) == 1 + assert record.undo_log[0].completed is True + + async def test_record_property_before_enter_raises(self): + """Accessing tx.record before __aenter__ raises TransactionError.""" + from openviking.storage.errors import TransactionError + + tx_manager, _ = _make_tx_manager() + ctx = TransactionContext(tx_manager, "test", ["/path"]) + + with pytest.raises(TransactionError, match="Transaction not started"): + _ = ctx.record + + async def test_multiple_undo_entries_sequence_increments(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) + s1 = tx.record_undo("fs_write_new", {"uri": "/a/f"}) + s2 = tx.record_undo("fs_mv", {"src": "/a", "dst": "/b"}) + assert s0 == 0 + assert s1 == 1 + assert s2 == 2 + await tx.commit() + + async def test_multiple_lock_paths_point_mode(self): + """Multiple lock_paths in point mode: each path gets acquire_lock_point called.""" + tx_manager, record = _make_tx_manager() + + async with TransactionContext( + tx_manager, "multi", ["/path1", "/path2"], lock_mode="point" + ) as tx: + await tx.commit() + + assert tx_manager.acquire_lock_point.call_count == 2 + + async def test_subtree_multiple_paths_stops_on_first_failure(self): + """If acquiring subtree lock on first path fails, second path is not attempted.""" + tx_manager, record = _make_tx_manager(lock_succeeds=False) + + with pytest.raises(LockAcquisitionError): + async with TransactionContext( + tx_manager, "rm", ["/path1", "/path2"], lock_mode="subtree" + ) as _tx: + pass + + # Only called once (failed on first path) + assert tx_manager.acquire_lock_subtree.call_count == 1 diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py new file mode 100644 index 00000000..85384574 --- 
/dev/null +++ b/tests/transaction/test_crash_recovery.py @@ -0,0 +1,385 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Integration test: crash recovery from journal.""" + +import time +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager + + +class TestCrashRecovery: + def _make_manager(self, journal_entries=None): + """Create a TransactionManager with mocked AGFS and journal data.""" + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + + if journal_entries: + manager._journal = MagicMock() + manager._journal.list_all.return_value = list(journal_entries.keys()) + manager._journal.read.side_effect = lambda tx_id: journal_entries[tx_id] + manager._journal.delete = MagicMock() + else: + manager._journal = MagicMock() + manager._journal.list_all.return_value = [] + + return manager, agfs + + async def test_recover_committed_with_post_actions(self): + """COMMIT + post_actions → replay post_actions, clean up.""" + entries = { + "tx-1": { + "id": "tx-1", + "status": "COMMIT", + "locks": ["/local/test/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test", + "context_type": "resource", + "account_id": "acc", + }, + } + ], + } + } + manager, agfs = self._make_manager(entries) + + with patch( + "openviking.storage.transaction.transaction_manager.TransactionManager._execute_post_actions", + new_callable=AsyncMock, + ) as mock_post: + await manager._recover_pending_transactions() + + mock_post.assert_called_once() + agfs.rm.assert_called_once_with("/local/test/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-1") + + async def test_recover_committed_no_post_actions(self): + """COMMIT + no post_actions → just clean up, no rollback.""" + entries = { + 
"tx-2": { + "id": "tx-2", + "status": "COMMIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + # Even if undo_log has entries, COMMIT should NOT rollback + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/a", "dst": "/b"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_not_called() # No rollback for committed transactions + manager._journal.delete.assert_called_once_with("tx-2") + + async def test_recover_exec_triggers_rollback(self): + """EXEC status → execute rollback regardless of transaction age.""" + entries = { + "tx-3": { + "id": "tx-3", + "status": "EXEC", + "locks": ["/local/x/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/local/a", "dst": "/local/b"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_called_once_with("/local/b", "/local/a") + manager._journal.delete.assert_called_once_with("tx-3") + + async def test_recover_fail_triggers_rollback(self): + """FAIL status → execute rollback.""" + entries = { + "tx-fail": { + "id": "tx-fail", + "status": "FAIL", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mkdir", + "params": {"uri": "/local/newdir"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/newdir") + manager._journal.delete.assert_called_once_with("tx-fail") + + async def test_recover_exec_recover_all_includes_incomplete(self): + """EXEC recovery uses recover_all=True: also reverses incomplete entries.""" + entries 
= { + "tx-partial": { + "id": "tx-partial", + "status": "EXEC", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/local/a", "dst": "/local/b"}, + "completed": False, # not completed, but recover_all=True should still reverse it + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_called_once_with("/local/b", "/local/a") + manager._journal.delete.assert_called_once_with("tx-partial") + + async def test_recover_init_just_cleans_up(self): + """INIT status → no rollback (nothing executed), just release locks and clean journal.""" + entries = { + "tx-4": { + "id": "tx-4", + "status": "INIT", + "locks": ["/local/y/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/y/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-4") + + async def test_recover_multiple_transactions(self): + """Multiple journals are all recovered.""" + entries = { + "tx-a": { + "id": "tx-a", + "status": "INIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + }, + "tx-b": { + "id": "tx-b", + "status": "COMMIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + }, + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + assert manager._journal.delete.call_count == 2 + + async def test_recover_init_empty_locks_cleans_orphan_via_init_info(self): + """INIT with empty locks but init_info.lock_paths → clean up orphan lock files.""" + entries = { + "tx-orphan": { + "id": "tx-orphan", + "status": "INIT", + "locks": 
[], # Empty: crash happened before journal recorded locks + "init_info": { + "operation": "rm", + "lock_paths": ["/local/orphan-dir"], + "lock_mode": "subtree", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + # Simulate: the lock file exists and is owned by this transaction + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-orphan", "S") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() + + # Should have removed the orphan lock file + agfs.rm.assert_called() + rm_paths = [call[0][0] for call in agfs.rm.call_args_list] + assert any(".path.ovlock" in p for p in rm_paths) + manager._journal.delete.assert_called_once_with("tx-orphan") + + async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): + """INIT with orphan lock path, but lock file owned by a different tx → not removed.""" + entries = { + "tx-innocent": { + "id": "tx-innocent", + "status": "INIT", + "locks": [], + "init_info": { + "operation": "rm", + "lock_paths": ["/local/shared-dir"], + "lock_mode": "subtree", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + # Lock file owned by a different transaction + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-OTHER-owner", "S") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() + + # rm should NOT be called for the lock file (only journal delete) + rm_calls = [call[0][0] for call in agfs.rm.call_args_list] if agfs.rm.called else [] + assert not any(".path.ovlock" in p for p in rm_calls) + manager._journal.delete.assert_called_once_with("tx-innocent") + + async def test_recover_aquire_status(self): + """AQUIRE 
status → same as INIT, clean up only.""" + entries = { + "tx-acq": { + "id": "tx-acq", + "status": "AQUIRE", + "locks": ["/local/z/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/z/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-acq") + + async def test_recover_releasing_status_triggers_rollback(self): + """RELEASING status → process crashed while releasing, rollback undo log.""" + entries = { + "tx-rel": { + "id": "tx-rel", + "status": "RELEASING", + "locks": ["/local/r/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mkdir", + "params": {"uri": "/local/tmpdir"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + # Should rollback the undo log + rm_paths = [call[0][0] for call in agfs.rm.call_args_list] + assert "/local/tmpdir" in rm_paths + manager._journal.delete.assert_called_once_with("tx-rel") + + async def test_recover_mv_orphan_locks_include_dst(self): + """INIT mv operation with init_info → check both lock_paths and mv_dst_path for orphan locks.""" + entries = { + "tx-mv-orphan": { + "id": "tx-mv-orphan", + "status": "INIT", + "locks": [], + "init_info": { + "operation": "mv", + "lock_paths": ["/local/src-dir"], + "lock_mode": "mv", + "mv_dst_path": "/local/dst-dir", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-mv-orphan", "P") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() 
+ + # Should check both src and dst paths for orphan locks + cat_paths = [call[0][0] for call in agfs.cat.call_args_list] + assert any("src-dir" in p for p in cat_paths) + assert any("dst-dir" in p for p in cat_paths) + + async def test_recover_journal_read_failure_skips_gracefully(self): + """If reading a journal entry fails, skip that tx and continue with others.""" + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + manager._journal = MagicMock() + manager._journal.list_all.return_value = ["tx-bad", "tx-good"] + + def read_side_effect(tx_id): + if tx_id == "tx-bad": + raise Exception("corrupted journal") + return { + "id": "tx-good", + "status": "INIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + + manager._journal.read.side_effect = read_side_effect + manager._journal.delete = MagicMock() + + await manager._recover_pending_transactions() + + # tx-good should still be cleaned up + manager._journal.delete.assert_called_once_with("tx-good") diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py new file mode 100644 index 00000000..88b6b5d6 --- /dev/null +++ b/tests/transaction/test_e2e.py @@ -0,0 +1,238 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""End-to-end transaction tests using real AGFS backend. + +These tests exercise the full stack: TransactionContext → TransactionManager → +PathLock → Journal → AGFS, verifying the complete acquire → operate → commit/rollback +→ release → journal cleanup lifecycle. 
+""" + +import uuid + +import pytest + +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME +from openviking.storage.transaction.transaction_manager import TransactionManager + + +@pytest.fixture +def tx_manager(agfs_client): + """Create a real TransactionManager backed by the test AGFS.""" + manager = TransactionManager( + agfs_client=agfs_client, + timeout=3600, + max_parallel_locks=8, + lock_timeout=5.0, + lock_expire=300.0, + ) + return manager + + +class TestE2ECommit: + async def test_full_commit_lifecycle(self, agfs_client, tx_manager, test_dir): + """Full lifecycle: context enter → record undo → commit → locks released → journal cleaned.""" + async with TransactionContext( + tx_manager, "test_write", [test_dir], lock_mode="point" + ) as tx: + # Lock should be acquired + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + token = agfs_client.cat(lock_path) + assert token is not None + + # Record some operations + seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/file.txt"}) + agfs_client.write(f"{test_dir}/file.txt", b"hello") + tx.mark_completed(seq) + + # Add post action + tx.add_post_action( + "enqueue_semantic", + {"uri": "viking://test", "context_type": "resource", "account_id": "default"}, + ) + + await tx.commit() + + # After commit: lock should be released + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock file should be gone after commit") + except Exception: + pass # Expected + + # Transaction should be removed from manager + assert tx_manager.get_transaction(tx.record.id) is None + + async def test_commit_file_persists(self, agfs_client, tx_manager, test_dir): + """Files written inside a committed transaction persist.""" + file_path = f"{test_dir}/committed-file.txt" + + async with TransactionContext(tx_manager, "write_op", [test_dir], lock_mode="point") as tx: + seq = 
tx.record_undo("fs_write_new", {"uri": file_path}) + agfs_client.write(file_path, b"committed data") + tx.mark_completed(seq) + await tx.commit() + + content = agfs_client.cat(file_path) + assert content == b"committed data" + + +class TestE2ERollback: + async def test_explicit_exception_triggers_rollback(self, agfs_client, tx_manager, test_dir): + """Exception inside context → auto-rollback → undo operations reversed.""" + new_dir = f"{test_dir}/to-be-rolled-back-{uuid.uuid4().hex}" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "failing_op", [test_dir], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) + agfs_client.mkdir(new_dir) + tx.mark_completed(seq) + + raise RuntimeError("simulated failure") + + # Directory should be removed by rollback + try: + agfs_client.stat(new_dir) + raise AssertionError("Directory should be removed by rollback") + except Exception: + pass + + # Lock should be released + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock should be released after rollback") + except Exception: + pass + + async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_dir): + """Exiting context without calling commit() triggers auto-rollback.""" + new_dir = f"{test_dir}/forgot-commit-{uuid.uuid4().hex}" + + async with TransactionContext(tx_manager, "no_commit", [test_dir], lock_mode="point") as tx: + seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) + agfs_client.mkdir(new_dir) + tx.mark_completed(seq) + # Intentionally not calling tx.commit() + + # Directory should be removed by rollback + try: + agfs_client.stat(new_dir) + raise AssertionError("Directory should be removed by rollback") + except Exception: + pass + + +class TestE2EMvLock: + async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): + """mv lock mode acquires SUBTREE on source and POINT on destination.""" + src = 
f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst + ) as tx: + # Both lock files should exist + src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token + dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token + + assert ":S" in src_token_str # SUBTREE on source + assert ":P" in dst_token_str # POINT on destination + + await tx.commit() + + # Both locks released + for path in [f"{src}/{LOCK_FILE_NAME}", f"{dst}/{LOCK_FILE_NAME}"]: + try: + agfs_client.cat(path) + raise AssertionError(f"Lock {path} should be gone") + except Exception: + pass + + +class TestE2ESubtreeRollback: + async def test_subtree_lock_with_rollback(self, agfs_client, tx_manager, test_dir): + """Subtree lock + rollback: undo is executed and lock released.""" + target = f"{test_dir}/sub-rb-{uuid.uuid4().hex}" + agfs_client.mkdir(target) + + child = f"{target}/child-{uuid.uuid4().hex}" + + with pytest.raises(ValueError): + async with TransactionContext(tx_manager, "rm_op", [target], lock_mode="subtree") as tx: + seq = tx.record_undo("fs_mkdir", {"uri": child}) + agfs_client.mkdir(child) + tx.mark_completed(seq) + + raise ValueError("abort rm") + + # Child dir should be removed by rollback + try: + agfs_client.stat(child) + raise AssertionError("Child should be cleaned up") + except Exception: + pass + + # Lock released + try: + agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") + raise AssertionError("Lock should be released") + except Exception: + pass + + +class TestE2EJournalCleanup: + async def test_journal_cleaned_after_commit(self, agfs_client, tx_manager, test_dir): + """After successful commit, the journal entry for the transaction is 
deleted.""" + journal = TransactionJournal(agfs_client) + + async with TransactionContext( + tx_manager, "journal_test", [test_dir], lock_mode="point" + ) as tx: + tx_id = tx.record.id + await tx.commit() + + # Journal should be cleaned up + all_ids = journal.list_all() + assert tx_id not in all_ids + + async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, test_dir): + """After rollback, the journal entry is also cleaned up.""" + journal = TransactionJournal(agfs_client) + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "journal_rb", [test_dir], lock_mode="point" + ) as tx: + tx_id = tx.record.id + raise RuntimeError("force rollback") + + all_ids = journal.list_all() + assert tx_id not in all_ids + + +class TestE2ESequentialTransactions: + async def test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): + """Two sequential transactions on the same path both succeed.""" + for i in range(3): + async with TransactionContext( + tx_manager, f"seq_{i}", [test_dir], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/f{i}.txt"}) + agfs_client.write(f"{test_dir}/f{i}.txt", f"data-{i}".encode()) + tx.mark_completed(seq) + await tx.commit() + + # All files should exist + for i in range(3): + content = agfs_client.cat(f"{test_dir}/f{i}.txt") + assert content == f"data-{i}".encode() + + assert tx_manager.get_transaction_count() == 0 diff --git a/tests/transaction/test_journal.py b/tests/transaction/test_journal.py new file mode 100644 index 00000000..57f1e483 --- /dev/null +++ b/tests/transaction/test_journal.py @@ -0,0 +1,215 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for transaction journal.""" + +import json +import uuid +from unittest.mock import MagicMock + +from openviking.storage.transaction.journal import TransactionJournal + + +class TestTransactionJournal: + def _make_journal(self) -> tuple: + agfs = MagicMock() + journal = TransactionJournal(agfs) + return journal, agfs + + def test_write_calls_agfs_write_with_correct_data(self): + journal, agfs = self._make_journal() + data = {"id": "tx-1", "status": "INIT", "locks": []} + + journal.write(data) + + # Should call agfs.write with the journal path and serialized data + agfs.write.assert_called_once() + path, payload = agfs.write.call_args[0] + assert "tx-1" in path + assert path.endswith("journal.json") + parsed = json.loads(payload.decode("utf-8")) + assert parsed["id"] == "tx-1" + assert parsed["status"] == "INIT" + + def test_write_ensures_directories_exist(self): + journal, agfs = self._make_journal() + data = {"id": "tx-1", "status": "INIT", "locks": []} + + journal.write(data) + + # Should call mkdir at least once (for parent dirs) + assert agfs.mkdir.called + + def test_update_overwrites(self): + journal, agfs = self._make_journal() + data = {"id": "tx-2", "status": "EXEC", "locks": []} + + journal.update(data) + + agfs.write.assert_called_once() + path, payload = agfs.write.call_args[0] + assert json.loads(payload.decode("utf-8"))["status"] == "EXEC" + + def test_read_parses_json(self): + journal, agfs = self._make_journal() + agfs.cat.return_value = json.dumps({"id": "tx-3", "status": "EXEC"}).encode("utf-8") + + result = journal.read("tx-3") + assert result["id"] == "tx-3" + assert result["status"] == "EXEC" + + def test_read_handles_string_response(self): + """Some AGFS backends may return str instead of bytes.""" + journal, agfs = self._make_journal() + agfs.cat.return_value = json.dumps({"id": "tx-str", "status": "INIT"}) + + result = journal.read("tx-str") + assert result["id"] == "tx-str" + + def 
test_delete_removes_directory(self): + journal, agfs = self._make_journal() + journal.delete("tx-4") + agfs.rm.assert_called_once() + path = agfs.rm.call_args[0][0] + assert "tx-4" in path + + def test_list_all_returns_tx_ids(self): + journal, agfs = self._make_journal() + agfs.ls.return_value = [ + {"name": "tx-a", "isDir": True}, + {"name": "tx-b", "isDir": True}, + {"name": ".", "isDir": True}, + ] + + result = journal.list_all() + assert "tx-a" in result + assert "tx-b" in result + assert "." not in result + + def test_list_all_filters_dotdot(self): + journal, agfs = self._make_journal() + agfs.ls.return_value = [ + {"name": "..", "isDir": True}, + {"name": "tx-real", "isDir": True}, + ] + + result = journal.list_all() + assert ".." not in result + assert "tx-real" in result + + def test_list_all_empty_on_error(self): + journal, agfs = self._make_journal() + agfs.ls.side_effect = Exception("not found") + + result = journal.list_all() + assert result == [] + + def test_delete_tolerates_missing(self): + journal, agfs = self._make_journal() + agfs.rm.side_effect = Exception("not found") + # Should not raise + journal.delete("tx-missing") + + def test_write_with_post_actions(self): + journal, agfs = self._make_journal() + data = { + "id": "tx-5", + "status": "COMMIT", + "locks": [], + "post_actions": [ + {"type": "enqueue_semantic", "params": {"uri": "viking://test"}}, + ], + } + journal.write(data) + path, payload = agfs.write.call_args[0] + parsed = json.loads(payload.decode("utf-8")) + assert len(parsed["post_actions"]) == 1 + assert parsed["post_actions"][0]["type"] == "enqueue_semantic" + + def test_write_with_undo_log(self): + journal, agfs = self._make_journal() + data = { + "id": "tx-6", + "status": "EXEC", + "locks": [], + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/a", "dst": "/b"}, + "completed": True, + }, + ], + } + journal.write(data) + _, payload = agfs.write.call_args[0] + parsed = 
json.loads(payload.decode("utf-8")) + assert len(parsed["undo_log"]) == 1 + assert parsed["undo_log"][0]["op_type"] == "fs_mv" + + +class TestTransactionJournalIntegration: + """Integration tests using real AGFS backend to verify persistence behavior.""" + + def test_write_read_roundtrip(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + data = {"id": tx_id, "status": "INIT", "locks": [], "undo_log": []} + + journal.write(data) + result = journal.read(tx_id) + + assert result["id"] == tx_id + assert result["status"] == "INIT" + + journal.delete(tx_id) + + def test_update_overwrites(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.update({"id": tx_id, "status": "EXEC", "locks": []}) + + result = journal.read(tx_id) + assert result["status"] == "EXEC" + + journal.delete(tx_id) + + def test_delete_removes_journal(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.delete(tx_id) + + try: + journal.read(tx_id) + raise AssertionError("Should have raised after deletion") + except Exception: + pass # Expected + + def test_list_all_returns_written_ids(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id_a = f"tx-int-{uuid.uuid4().hex}" + tx_id_b = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id_a, "status": "INIT", "locks": []}) + journal.write({"id": tx_id_b, "status": "INIT", "locks": []}) + + result = journal.list_all() + assert tx_id_a in result + assert tx_id_b in result + + journal.delete(tx_id_a) + journal.delete(tx_id_b) + + def test_list_all_empty_when_none(self, agfs_client): + """After cleanup, list_all should not include previously deleted entries.""" + journal = TransactionJournal(agfs_client) + tx_id = 
f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.delete(tx_id) + + result = journal.list_all() + assert tx_id not in result diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py new file mode 100644 index 00000000..e9af3fdc --- /dev/null +++ b/tests/transaction/test_path_lock.py @@ -0,0 +1,334 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for path lock with fencing tokens.""" + +import time +from unittest.mock import MagicMock + +from openviking.storage.transaction.path_lock import ( + LOCK_FILE_NAME, + LOCK_TYPE_POINT, + LOCK_TYPE_SUBTREE, + PathLock, + _make_fencing_token, + _parse_fencing_token, +) +from openviking.storage.transaction.transaction_record import TransactionRecord + + +class TestFencingToken: + def test_make_parse_roundtrip(self): + token = _make_fencing_token("tx-123") + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-123" + assert ts > 0 + assert lock_type == LOCK_TYPE_POINT + + def test_make_parse_subtree_roundtrip(self): + token = _make_fencing_token("tx-456", LOCK_TYPE_SUBTREE) + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-456" + assert ts > 0 + assert lock_type == LOCK_TYPE_SUBTREE + + def test_parse_legacy_format_two_part(self): + """Legacy two-part token "{tx_id}:{ts}" defaults to POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-old:1234567890") + assert tx_id == "tx-old" + assert ts == 1234567890 + assert lock_type == LOCK_TYPE_POINT + + def test_parse_legacy_format_plain(self): + """Plain tx_id (no colon) defaults to ts=0, lock_type=POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-bare") + assert tx_id == "tx-bare" + assert ts == 0 + assert lock_type == LOCK_TYPE_POINT + + def test_tokens_are_unique(self): + t1 = _make_fencing_token("tx-1") + time.sleep(0.001) + t2 = _make_fencing_token("tx-1") + assert t1 
!= t2 + + +class TestPathLockStale: + def test_is_lock_stale_no_file(self): + agfs = MagicMock() + agfs.cat.side_effect = Exception("not found") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_legacy_token(self): + agfs = MagicMock() + agfs.cat.return_value = b"tx-old-format" + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_recent_token(self): + agfs = MagicMock() + token = _make_fencing_token("tx-1") + agfs.cat.return_value = token.encode("utf-8") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock", expire_seconds=300.0) is False + + +class TestPathLockBehavior: + """Behavioral tests using real AGFS backend.""" + + async def test_acquire_point_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-point-1") + + ok = await lock.acquire_point(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":P" in token + assert "tx-point-1" in token + + await lock.release(tx) + + async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-subtree-1") + + ok = await lock.acquire_subtree(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":S" in token + assert "tx-subtree-1" in token + + await lock.release(tx) + + async def test_acquire_point_dir_not_found(self, agfs_client): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-no-dir") + + ok = await lock.acquire_point("/local/nonexistent-path-xyz", tx, timeout=0.5) + assert ok is False + assert len(tx.locks) == 0 + 
+ async def test_release_removes_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-release-1") + + await lock.acquire_point(test_dir, tx, timeout=3.0) + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + await lock.release(tx) + + # Lock file should be gone + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock file should have been removed") + except Exception: + pass # Expected: file not found + + async def test_sequential_acquire_works(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + + tx1 = TransactionRecord(id="tx-seq-1") + ok1 = await lock.acquire_point(test_dir, tx1, timeout=3.0) + assert ok1 is True + + await lock.release(tx1) + + tx2 = TransactionRecord(id="tx-seq-2") + ok2 = await lock.acquire_point(test_dir, tx2, timeout=3.0) + assert ok2 is True + + await lock.release(tx2) + + async def test_point_blocked_by_ancestor_subtree(self, agfs_client, test_dir): + """POINT on child blocked while ancestor holds SUBTREE lock.""" + import uuid as _uuid + + child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_parent = TransactionRecord(id="tx-parent-subtree") + ok = await lock.acquire_subtree(test_dir, tx_parent, timeout=3.0) + assert ok is True + + tx_child = TransactionRecord(id="tx-child-point") + blocked = await lock.acquire_point(child, tx_child, timeout=0.5) + assert blocked is False + + await lock.release(tx_parent) + + async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): + """SUBTREE on parent blocked while descendant holds POINT lock.""" + import uuid as _uuid + + child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_child = TransactionRecord(id="tx-desc-point") + ok = await lock.acquire_point(child, tx_child, timeout=3.0) + assert ok is True + + tx_parent = TransactionRecord(id="tx-parent-sub") + blocked = await 
lock.acquire_subtree(test_dir, tx_parent, timeout=0.5) + assert blocked is False + + await lock.release(tx_child) + + async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir): + """acquire_mv puts SUBTREE on src and POINT on dst.""" + import uuid as _uuid + + src = f"{test_dir}/src-{_uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{_uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-mv-1") + ok = await lock.acquire_mv(src, dst, tx, timeout=3.0) + assert ok is True + + src_token_bytes = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + src_token = ( + src_token_bytes.decode("utf-8") + if isinstance(src_token_bytes, bytes) + else src_token_bytes + ) + assert ":S" in src_token + + dst_token_bytes = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + dst_token = ( + dst_token_bytes.decode("utf-8") + if isinstance(dst_token_bytes, bytes) + else dst_token_bytes + ) + assert ":P" in dst_token + + await lock.release(tx) + + async def test_point_does_not_block_sibling_point(self, agfs_client, test_dir): + """POINT locks on different directories do not conflict.""" + import uuid as _uuid + + dir_a = f"{test_dir}/sibling-a-{_uuid.uuid4().hex}" + dir_b = f"{test_dir}/sibling-b-{_uuid.uuid4().hex}" + agfs_client.mkdir(dir_a) + agfs_client.mkdir(dir_b) + + lock = PathLock(agfs_client) + tx_a = TransactionRecord(id="tx-sib-a") + tx_b = TransactionRecord(id="tx-sib-b") + + ok_a = await lock.acquire_point(dir_a, tx_a, timeout=3.0) + ok_b = await lock.acquire_point(dir_b, tx_b, timeout=3.0) + + assert ok_a is True + assert ok_b is True + + await lock.release(tx_a) + await lock.release(tx_b) + + async def test_stale_lock_auto_removed_on_acquire(self, agfs_client, test_dir): + """A stale lock (expired fencing token) is auto-removed, allowing a new acquire.""" + import uuid as _uuid + + target = f"{test_dir}/stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock_path = 
f"{target}/{LOCK_FILE_NAME}" + + # Write a lock file with a very old timestamp (simulate crashed process) + old_ts = time.time_ns() - int(600 * 1e9) # 600 seconds ago + stale_token = f"tx-dead:{old_ts}:{LOCK_TYPE_POINT}" + agfs_client.write(lock_path, stale_token.encode("utf-8")) + + # New transaction should succeed by auto-removing the stale lock + lock = PathLock(agfs_client, lock_expire=300.0) + tx = TransactionRecord(id="tx-new-owner") + ok = await lock.acquire_point(target, tx, timeout=2.0) + assert ok is True + + # Verify new lock is owned by our transaction + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert "tx-new-owner" in token + + await lock.release(tx) + + async def test_stale_subtree_ancestor_auto_removed(self, agfs_client, test_dir): + """A stale SUBTREE lock on ancestor is auto-removed when child acquires POINT.""" + import uuid as _uuid + + child = f"{test_dir}/child-stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + # Write stale SUBTREE lock on parent + parent_lock = f"{test_dir}/{LOCK_FILE_NAME}" + old_ts = time.time_ns() - int(600 * 1e9) + stale_token = f"tx-dead-parent:{old_ts}:{LOCK_TYPE_SUBTREE}" + agfs_client.write(parent_lock, stale_token.encode("utf-8")) + + lock = PathLock(agfs_client, lock_expire=300.0) + tx = TransactionRecord(id="tx-child-new") + ok = await lock.acquire_point(child, tx, timeout=2.0) + assert ok is True + + await lock.release(tx) + # Clean up stale parent lock if still present + try: + agfs_client.rm(parent_lock) + except Exception: + pass + + async def test_point_same_path_no_wait_fails_immediately(self, agfs_client, test_dir): + """With timeout=0, a conflicting lock fails immediately.""" + import uuid as _uuid + + target = f"{test_dir}/nowait-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = TransactionRecord(id="tx-hold") + ok1 = await lock.acquire_point(target, tx1, timeout=3.0) + assert ok1 is 
True + + # Second acquire with timeout=0 should fail immediately + tx2 = TransactionRecord(id="tx-blocked") + t0 = time.monotonic() + ok2 = await lock.acquire_point(target, tx2, timeout=0.0) + elapsed = time.monotonic() - t0 + + assert ok2 is False + assert elapsed < 1.0 # Should not wait + + await lock.release(tx1) + + async def test_subtree_same_path_mutual_exclusion(self, agfs_client, test_dir): + """Two SUBTREE locks on the same path: second one blocked until first releases.""" + import uuid as _uuid + + target = f"{test_dir}/sub-excl-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = TransactionRecord(id="tx-sub1") + ok1 = await lock.acquire_subtree(target, tx1, timeout=3.0) + assert ok1 is True + + tx2 = TransactionRecord(id="tx-sub2") + ok2 = await lock.acquire_subtree(target, tx2, timeout=0.5) + assert ok2 is False + + await lock.release(tx1) + + # Now tx2 should succeed + ok2_retry = await lock.acquire_subtree(target, tx2, timeout=3.0) + assert ok2_retry is True + await lock.release(tx2) diff --git a/tests/transaction/test_post_actions.py b/tests/transaction/test_post_actions.py new file mode 100644 index 00000000..2ae3c12b --- /dev/null +++ b/tests/transaction/test_post_actions.py @@ -0,0 +1,112 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for post_actions execution and replay.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager + + +class TestPostActions: + def _make_manager(self): + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + manager._journal = MagicMock() + return manager, agfs + + async def test_execute_enqueue_semantic(self): + manager, _ = self._make_manager() + + mock_queue = AsyncMock() + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.return_value = mock_queue + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://resources/test", + "context_type": "resource", + "account_id": "acc-1", + }, + } + ] + ) + + mock_queue.enqueue.assert_called_once() + msg = mock_queue.enqueue.call_args[0][0] + assert msg.uri == "viking://resources/test" + assert msg.context_type == "resource" + assert msg.account_id == "acc-1" + + async def test_execute_unknown_action_logged(self): + manager, _ = self._make_manager() + # Should not raise, just log + await manager._execute_post_actions( + [ + {"type": "unknown_action", "params": {}}, + ] + ) + + async def test_execute_multiple_actions(self): + manager, _ = self._make_manager() + + mock_queue = AsyncMock() + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.return_value = mock_queue + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://a", + "context_type": "resource", + "account_id": "acc-1", + }, + }, + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://b", + "context_type": "memory", + "account_id": "acc-2", + }, + }, + ] + 
) + + assert mock_queue.enqueue.call_count == 2 + + async def test_post_action_failure_does_not_crash(self): + manager, _ = self._make_manager() + + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.side_effect = Exception("queue not available") + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + # Should not raise + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test", + "context_type": "resource", + "account_id": "", + }, + }, + ] + ) diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py new file mode 100644 index 00000000..ee28b7e7 --- /dev/null +++ b/tests/transaction/test_rm_rollback.py @@ -0,0 +1,233 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Integration tests: multi-step rollback covering FS + VectorDB coordination.""" + +from unittest.mock import AsyncMock, MagicMock + +from openviking.storage.transaction.undo import UndoEntry, execute_rollback + + +class TestRmRollback: + def test_vectordb_records_restored_on_fs_failure(self): + """When FS rm fails (incomplete), VectorDB delete is rolled back via snapshot.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + snapshot = [{"id": "r1", "uri": "viking://a", "content": "data"}] + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"uris": ["viking://a"], "records_snapshot": snapshot}, + completed=True, # VectorDB delete succeeded + ), + UndoEntry( + sequence=1, + op_type="fs_rm", + params={"uri": "/local/test", "recursive": True}, + completed=False, # FS rm never ran + ), + ] + + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + # Only vectordb_delete (completed=True) is reversed + vector_store.upsert.assert_called_once_with(snapshot[0]) + # fs_rm is incomplete, so it's skipped (also fs_rm is never 
reversible anyway) + agfs.rm.assert_not_called() + + def test_fs_rm_not_reversible_even_when_completed(self): + """fs_rm is intentionally irreversible: even completed=True is skipped.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_rm", + params={"uri": "/local/test"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_not_called() + agfs.mv.assert_not_called() + + +class TestMvRollback: + def test_file_moved_back_on_vectordb_failure(self): + """When VectorDB update fails (incomplete), FS mv is reversed.""" + agfs = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=True, # FS mv succeeded + ), + UndoEntry( + sequence=1, + op_type="vectordb_update_uri", + params={ + "old_uri": "viking://a", + "new_uri": "viking://b", + "old_parent_uri": "viking://", + }, + completed=False, # VectorDB update never ran + ), + ] + + execute_rollback(undo_log, agfs) + + # Only fs_mv (completed=True) is reversed + agfs.mv.assert_called_once_with("/local/b", "/local/a") + + +class TestRecoverAll: + def test_recover_all_reverses_incomplete_entries(self): + """recover_all=True (crash recovery mode) also reverses incomplete entries.""" + agfs = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": "/local/newdir"}, + completed=True, + ), + UndoEntry( + sequence=1, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=False, # Crash happened mid-operation + ), + ] + + execute_rollback(undo_log, agfs, recover_all=True) + + # Both entries should be reversed (in reverse sequence order) + assert agfs.mv.call_count == 1 + agfs.mv.assert_called_once_with("/local/b", "/local/a") + agfs.rm.assert_called_once_with("/local/newdir") + + def test_recover_all_false_skips_incomplete(self): + """recover_all=False (normal rollback) skips incomplete entries.""" + agfs = MagicMock() + + undo_log = [ + 
UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=False, + ), + ] + + execute_rollback(undo_log, agfs, recover_all=False) + agfs.mv.assert_not_called() + + +class TestVectorDBRollbackEdgeCases: + def test_multi_record_vectordb_delete_rollback(self): + """Multiple VectorDB records in snapshot should all be restored.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + snapshot = [ + {"id": "r1", "uri": "viking://a", "content": "data1"}, + {"id": "r2", "uri": "viking://b", "content": "data2"}, + {"id": "r3", "uri": "viking://c", "content": "data3"}, + ] + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={ + "uris": ["viking://a", "viking://b", "viking://c"], + "records_snapshot": snapshot, + }, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + assert vector_store.upsert.call_count == 3 + + def test_empty_snapshot_vectordb_delete_rollback(self): + """Empty snapshot → nothing to restore, no error.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"uris": [], "records_snapshot": []}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + vector_store.upsert.assert_not_called() + + def test_vectordb_delete_partial_restore_failure(self): + """If restoring one record fails, others should still be attempted.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + call_count = 0 + + async def upsert_side_effect(record): + nonlocal call_count + call_count += 1 + if record["id"] == "r2": + raise Exception("upsert failed") + + vector_store.upsert = AsyncMock(side_effect=upsert_side_effect) + + snapshot = [ + {"id": "r1", "uri": "viking://a"}, + {"id": "r2", "uri": "viking://b"}, # This one fails + {"id": "r3", "uri": "viking://c"}, + ] + undo_log = [ + 
UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"records_snapshot": snapshot}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + # All 3 should be attempted (best-effort per record) + assert call_count == 3 + + def test_vectordb_upsert_rollback_without_vector_store_is_noop(self): + """vectordb_upsert rollback without vector_store does nothing.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={"record_id": "r1"}, + completed=True, + ), + ] + # Should not raise + execute_rollback(undo_log, agfs, vector_store=None) + + def test_unknown_op_type_does_not_crash(self): + """Unknown op_type is logged but doesn't raise.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="some_future_op", + params={"foo": "bar"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs) diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py new file mode 100644 index 00000000..ab9d5256 --- /dev/null +++ b/tests/transaction/test_transaction_manager.py @@ -0,0 +1,323 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for TransactionManager: CRUD, lifecycle, commit/rollback flows, timeout cleanup.""" + +import time +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus + + +def _make_manager(**kwargs): + """Create a TransactionManager with mocked AGFS and journal.""" + agfs = MagicMock() + defaults = {"agfs_client": agfs, "timeout": 3600, "lock_timeout": 0.0, "lock_expire": 300.0} + defaults.update(kwargs) + manager = TransactionManager(**defaults) + manager._journal = MagicMock() + manager._journal.list_all.return_value = [] + return manager, agfs + + +class TestCreateAndGet: + def test_create_transaction_returns_record(self): + manager, _ = _make_manager() + tx = manager.create_transaction(init_info={"operation": "rm"}) + assert isinstance(tx, TransactionRecord) + assert tx.status == TransactionStatus.INIT + assert tx.init_info == {"operation": "rm"} + + def test_create_assigns_unique_ids(self): + manager, _ = _make_manager() + tx1 = manager.create_transaction() + tx2 = manager.create_transaction() + assert tx1.id != tx2.id + + def test_get_transaction_found(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + assert manager.get_transaction(tx.id) is tx + + def test_get_transaction_not_found(self): + manager, _ = _make_manager() + assert manager.get_transaction("nonexistent") is None + + def test_get_transaction_count(self): + manager, _ = _make_manager() + assert manager.get_transaction_count() == 0 + manager.create_transaction() + assert manager.get_transaction_count() == 1 + manager.create_transaction() + assert manager.get_transaction_count() == 2 + + def test_get_active_transactions(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + active = manager.get_active_transactions() + assert tx.id in 
active + # Returned copy, not the internal dict + active.pop(tx.id) + assert manager.get_transaction(tx.id) is tx + + +class TestBegin: + async def test_begin_updates_status(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + ok = await manager.begin(tx.id) + assert ok is True + assert tx.status == TransactionStatus.AQUIRE + + async def test_begin_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.begin("unknown-tx") + assert ok is False + + +class TestCommitFlow: + async def test_commit_full_lifecycle(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + # Simulate lock acquisition + tx.update_status(TransactionStatus.EXEC) + tx.add_lock("/test/.path.ovlock") + + ok = await manager.commit(tx.id) + assert ok is True + assert tx.status == TransactionStatus.RELEASED + # Removed from active transactions + assert manager.get_transaction(tx.id) is None + # Journal cleaned up + manager._journal.delete.assert_called_once_with(tx.id) + + async def test_commit_persists_journal_before_release(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + call_order = [] + original_update = manager._journal.update + + def track_update(data): + call_order.append(("journal_update", data.get("status"))) + return original_update(data) + + manager._journal.update = track_update + manager._journal.delete = MagicMock( + side_effect=lambda _: call_order.append(("journal_delete",)) + ) + + await manager.commit(tx.id) + # Journal update (COMMIT) happens before delete + assert call_order[0] == ("journal_update", "COMMIT") + + async def test_commit_executes_post_actions(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + tx.post_actions.append({"type": "enqueue_semantic", "params": {"uri": "viking://x"}}) + + with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: + 
await manager.commit(tx.id) + mock_post.assert_called_once() + + async def test_commit_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.commit("nonexistent") + assert ok is False + + async def test_commit_releases_locks(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + tx.add_lock("/a/.path.ovlock") + tx.add_lock("/b/.path.ovlock") + + with patch.object(manager._path_lock, "release", new_callable=AsyncMock) as mock_release: + await manager.commit(tx.id) + mock_release.assert_called_once() + + +class TestRollbackFlow: + async def test_rollback_executes_undo_log(self): + manager, agfs = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + from openviking.storage.transaction.undo import UndoEntry + + tx.undo_log.append( + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ) + ) + + ok = await manager.rollback(tx.id) + assert ok is True + assert tx.status == TransactionStatus.RELEASED + agfs.mv.assert_called_once_with("/b", "/a") + + async def test_rollback_removes_from_active(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + await manager.rollback(tx.id) + assert manager.get_transaction(tx.id) is None + + async def test_rollback_cleans_journal(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + await manager.rollback(tx.id) + manager._journal.delete.assert_called_once_with(tx.id) + + async def test_rollback_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.rollback("nonexistent") + assert ok is False + + async def test_rollback_undo_failure_does_not_prevent_cleanup(self): + """Undo failure is best-effort; lock release and journal cleanup still happen.""" + manager, agfs = _make_manager() + tx = manager.create_transaction() + 
tx.update_status(TransactionStatus.EXEC) + + from openviking.storage.transaction.undo import UndoEntry + + tx.undo_log.append( + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ) + ) + agfs.mv.side_effect = Exception("disk error") + + ok = await manager.rollback(tx.id) + assert ok is True + manager._journal.delete.assert_called_once() + + +class TestLockAcquisitionWrappers: + async def test_acquire_lock_point_success_transitions_to_exec(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_point(tx.id, "/test") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_point_failure_transitions_to_fail(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=False + ): + ok = await manager.acquire_lock_point(tx.id, "/test") + assert ok is False + assert tx.status == TransactionStatus.FAIL + + async def test_acquire_lock_subtree_success(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_subtree(tx.id, "/test") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_subtree_uses_config_timeout(self): + manager, _ = _make_manager(lock_timeout=5.0) + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ) as mock_acquire: + await manager.acquire_lock_subtree(tx.id, "/test") + mock_acquire.assert_called_once_with("/test", tx, timeout=5.0) + + async def test_acquire_lock_subtree_override_timeout(self): + manager, _ = 
_make_manager(lock_timeout=5.0) + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ) as mock_acquire: + await manager.acquire_lock_subtree(tx.id, "/test", timeout=10.0) + mock_acquire.assert_called_once_with("/test", tx, timeout=10.0) + + async def test_acquire_lock_mv_success(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_mv", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_mv(tx.id, "/src", "/dst") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.acquire_lock_point("nonexistent", "/test") + assert ok is False + + +class TestLifecycle: + async def test_start_sets_running(self): + manager, _ = _make_manager() + await manager.start() + assert manager._running is True + manager.stop() + + async def test_start_idempotent(self): + manager, _ = _make_manager() + await manager.start() + await manager.start() # Should not error + assert manager._running is True + manager.stop() + + async def test_stop_clears_state(self): + manager, _ = _make_manager() + await manager.start() + manager.create_transaction() + manager.stop() + assert manager._running is False + assert manager.get_transaction_count() == 0 + + async def test_stop_idempotent(self): + manager, _ = _make_manager() + manager.stop() + manager.stop() # Should not error + + +class TestTimeoutCleanup: + async def test_cleanup_timed_out_rolls_back(self): + manager, _ = _make_manager(timeout=1) + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + # Simulate old updated_at + tx.updated_at = time.time() - 10 + + with patch.object( + manager, "rollback", new_callable=AsyncMock, return_value=True + ) as mock_rb: + await manager._cleanup_timed_out() + 
mock_rb.assert_called_once_with(tx.id) + + async def test_cleanup_skips_fresh_transactions(self): + manager, _ = _make_manager(timeout=3600) + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + with patch.object(manager, "rollback", new_callable=AsyncMock) as mock_rb: + await manager._cleanup_timed_out() + mock_rb.assert_not_called() diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py new file mode 100644 index 00000000..d67063d1 --- /dev/null +++ b/tests/transaction/test_undo.py @@ -0,0 +1,163 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for undo log and rollback executor.""" + +from unittest.mock import AsyncMock, MagicMock + +from openviking.storage.transaction.undo import UndoEntry, execute_rollback + + +class TestUndoEntry: + def test_to_dict(self): + entry = UndoEntry(sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}) + d = entry.to_dict() + assert d["sequence"] == 0 + assert d["op_type"] == "fs_mv" + assert d["params"] == {"src": "/a", "dst": "/b"} + assert d["completed"] is False + + def test_from_dict(self): + data = {"sequence": 1, "op_type": "fs_rm", "params": {"uri": "/x"}, "completed": True} + entry = UndoEntry.from_dict(data) + assert entry.sequence == 1 + assert entry.op_type == "fs_rm" + assert entry.completed is True + + def test_roundtrip(self): + entry = UndoEntry( + sequence=5, op_type="vectordb_upsert", params={"record_id": "r1"}, completed=True + ) + restored = UndoEntry.from_dict(entry.to_dict()) + assert restored.sequence == entry.sequence + assert restored.op_type == entry.op_type + assert restored.params == entry.params + assert restored.completed == entry.completed + + +class TestExecuteRollback: + def test_rollback_fs_mv(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + ] + execute_rollback(undo_log, agfs) 
+ agfs.mv.assert_called_once_with("/b", "/a") + + def test_rollback_fs_rm_skipped(self): + agfs = MagicMock() + undo_log = [ + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": "/a"}, completed=True), + ] + execute_rollback(undo_log, agfs) + agfs.mv.assert_not_called() + agfs.rm.assert_not_called() + + def test_rollback_fs_mkdir(self): + agfs = MagicMock() + undo_log = [ + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": "/a/b"}, completed=True), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_called_once_with("/a/b") + + def test_rollback_fs_write_new(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_write_new", params={"uri": "/a/f.txt"}, completed=True + ), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_called_once_with("/a/f.txt", recursive=True) + + def test_rollback_vectordb_upsert(self): + agfs = MagicMock() + vector_store = AsyncMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={"record_id": "r1"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store) + vector_store.delete.assert_called_once_with(["r1"]) + + def test_rollback_vectordb_update_uri(self): + agfs = MagicMock() + ctx = MagicMock() + vector_store = AsyncMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_update_uri", + params={ + "old_uri": "viking://a", + "new_uri": "viking://b", + "old_parent_uri": "viking://", + }, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + vector_store.update_uri_mapping.assert_called_once_with( + ctx=ctx, uri="viking://b", new_uri="viking://a", new_parent_uri="viking://" + ) + + def test_rollback_reverse_order(self): + """Rollback should process entries in reverse sequence order.""" + agfs = MagicMock() + call_order = [] + original_mv = agfs.mv + original_rm = agfs.rm + + def track_mv(*args): + call_order.append(("mv", args)) + return original_mv(*args) + + def 
track_rm(*args, **kwargs): + call_order.append(("rm", args)) + return original_rm(*args, **kwargs) + + agfs.mv = track_mv + agfs.rm = track_rm + + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), + ] + execute_rollback(undo_log, agfs) + # seq=1 should be rolled back first (mkdir→rm), then seq=0 (mv→reverse mv) + assert call_order[0][0] == "rm" + assert call_order[1][0] == "mv" + + def test_rollback_skips_incomplete(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=False + ), + ] + execute_rollback(undo_log, agfs) + agfs.mv.assert_not_called() + + def test_rollback_best_effort(self): + """A failing rollback entry should not prevent others from running.""" + agfs = MagicMock() + agfs.rm.side_effect = Exception("boom") + agfs.mv = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), + ] + execute_rollback(undo_log, agfs) + # fs_mkdir rollback failed (rm raises), but fs_mv rollback should still run + agfs.mv.assert_called_once_with("/b", "/a") diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go index f2ccde99..c20fdc66 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go @@ -24,9 +24,18 @@ type QueueBackend interface { // Enqueue adds a message to a queue Enqueue(queueName string, msg QueueMessage) error - // Dequeue removes and returns the first message from a queue + // Dequeue marks the first pending message as 'processing' and returns it. + // Call Ack after successful processing to permanently delete the message. 
Dequeue(queueName string) (QueueMessage, bool, error) + // Ack permanently deletes a message that has been successfully processed. + Ack(queueName string, messageID string) error + + // RecoverStale resets messages stuck in 'processing' state back to 'pending'. + // staleSec: minimum age in seconds; pass 0 to reset all processing messages. + // Returns the number of messages recovered. + RecoverStale(staleSec int64) (int, error) + // Peek returns the first message without removing it Peek(queueName string) (QueueMessage, bool, error) @@ -124,6 +133,16 @@ func (b *MemoryBackend) Dequeue(queueName string) (QueueMessage, bool, error) { return msg, true, nil } +// Ack is a no-op for the memory backend (messages are already removed on Dequeue). +func (b *MemoryBackend) Ack(queueName string, messageID string) error { + return nil +} + +// RecoverStale is a no-op for the memory backend (no persistence across restarts). +func (b *MemoryBackend) RecoverStale(staleSec int64) (int, error) { + return 0, nil +} + func (b *MemoryBackend) Peek(queueName string) (QueueMessage, bool, error) { queue, exists := b.queues[queueName] if !exists { @@ -345,6 +364,16 @@ func (b *TiDBBackend) Enqueue(queueName string, msg QueueMessage) error { return nil } +// Ack is not yet implemented for TiDB backend (messages are already soft-deleted on Dequeue). +func (b *TiDBBackend) Ack(queueName string, messageID string) error { + return nil +} + +// RecoverStale is not yet implemented for TiDB backend. 
+func (b *TiDBBackend) RecoverStale(staleSec int64) (int, error) { + return 0, nil +} + func (b *TiDBBackend) Dequeue(queueName string) (QueueMessage, bool, error) { // Get table name from cache (lazy loading) tableName, err := b.getTableName(queueName, false) diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go index 03b7342f..9639531c 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go @@ -63,16 +63,22 @@ func (b *SQLiteDBBackend) GetInitSQL() []string { last_updated INTEGER DEFAULT (strftime('%s', 'now')) )`, // Queue messages table + // status: 'pending' (waiting) | 'processing' (dequeued, not yet acked) + // processing_started_at: Unix timestamp when dequeued; NULL if pending `CREATE TABLE IF NOT EXISTS queue_messages ( id INTEGER PRIMARY KEY AUTOINCREMENT, queue_name TEXT NOT NULL, message_id TEXT NOT NULL, data TEXT NOT NULL, timestamp INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + processing_started_at INTEGER, created_at INTEGER DEFAULT (strftime('%s', 'now')) )`, `CREATE INDEX IF NOT EXISTS idx_queue_name ON queue_messages(queue_name)`, `CREATE INDEX IF NOT EXISTS idx_queue_order ON queue_messages(queue_name, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, } } diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go index d8d481b0..052a8f19 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go @@ -137,7 +137,9 @@ func (q *QueueFSPlugin) Initialize(cfg map[string]interface{}) error { switch backendType { case "memory": backend = NewMemoryBackend() - case "tidb", 
"mysql", "sqlite", "sqlite3": + case "sqlite", "sqlite3": + backend = NewSQLiteQueueBackend() + case "tidb", "mysql": backend = NewTiDBBackend() default: return fmt.Errorf("unsupported backend: %s", backendType) @@ -384,6 +386,7 @@ var queueOperations = map[string]bool{ "peek": true, "size": true, "clear": true, + "ack": true, // write message_id to confirm processing complete (at-least-once delivery) } // parseQueuePath parses a path like "/queue_name/operation" or "/dir/queue_name/operation" @@ -529,7 +532,7 @@ func (qfs *queueFS) Read(path string, offset int64, size int64) ([]byte, error) data, err = qfs.peek(queueName) case "size": data, err = qfs.size(queueName) - case "enqueue", "clear": + case "enqueue", "clear", "ack": // Write-only files return []byte(""), fmt.Errorf("permission denied: %s is write-only", path) default: @@ -573,6 +576,12 @@ func (qfs *queueFS) Write(path string, data []byte, offset int64, flags filesyst return 0, err } return 0, nil + case "ack": + msgID := strings.TrimSpace(string(data)) + if err := qfs.ackMessage(queueName, msgID); err != nil { + return 0, err + } + return int64(len(data)), nil default: return 0, fmt.Errorf("cannot write to: %s", path) } @@ -844,7 +853,7 @@ func (qfs *queueFS) Stat(p string) (*filesystem.FileInfo, error) { } mode := uint32(0644) - if operation == "enqueue" || operation == "clear" { + if operation == "enqueue" || operation == "clear" || operation == "ack" { mode = 0222 } else { mode = 0444 @@ -992,6 +1001,13 @@ func (qfs *queueFS) clear(queueName string) error { return qfs.plugin.backend.Clear(queueName) } +func (qfs *queueFS) ackMessage(queueName string, msgID string) error { + qfs.plugin.mu.Lock() + defer qfs.plugin.mu.Unlock() + + return qfs.plugin.backend.Ack(queueName, msgID) +} + // Ensure QueueFSPlugin implements ServicePlugin var _ plugin.ServicePlugin = (*QueueFSPlugin)(nil) var _ filesystem.FileSystem = (*queueFS)(nil) diff --git 
a/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go new file mode 100644 index 00000000..2a0c4dbe --- /dev/null +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go @@ -0,0 +1,321 @@ +package queuefs + +import ( + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + log "github.com/sirupsen/logrus" +) + +// SQLiteQueueBackend implements QueueBackend using SQLite with a single-table schema. +// +// Schema: +// - queue_metadata: tracks all queues (including empty ones created via mkdir) +// - queue_messages: stores all messages, filtered by queue_name column +// - status: 'pending' (waiting to be processed) | 'processing' (dequeued, awaiting ack) +// - processing_started_at: Unix timestamp when dequeued; NULL while pending +// +// Delivery semantics: at-least-once +// - Dequeue marks message as 'processing' (does NOT delete) +// - Ack deletes the message after successful processing +// - On startup, RecoverStale resets all 'processing' messages back to 'pending' +// so that messages from a previous crashed run are automatically retried +type SQLiteQueueBackend struct { + db *sql.DB +} + +func NewSQLiteQueueBackend() *SQLiteQueueBackend { + return &SQLiteQueueBackend{} +} + +func (b *SQLiteQueueBackend) Initialize(config map[string]interface{}) error { + dbBackend := NewSQLiteDBBackend() + + db, err := dbBackend.Open(config) + if err != nil { + return fmt.Errorf("failed to open SQLite database: %w", err) + } + b.db = db + + for _, sqlStmt := range dbBackend.GetInitSQL() { + if _, err := db.Exec(sqlStmt); err != nil { + db.Close() + return fmt.Errorf("failed to initialize schema: %w", err) + } + } + + // Migrate existing databases: add new columns if they don't exist yet. + b.runMigrations() + + // Reset any messages left in 'processing' state by a previous crashed process. 
+ // staleSec=0 resets ALL processing messages — safe at startup because no workers + // are running yet. + if n, err := b.RecoverStale(0); err != nil { + log.Warnf("[queuefs] Failed to recover stale messages on startup: %v", err) + } else if n > 0 { + log.Infof("[queuefs] Recovered %d in-flight message(s) from previous run", n) + } + + log.Info("[queuefs] SQLite backend initialized") + return nil +} + +// runMigrations applies schema changes needed to upgrade an existing database. +// Each ALTER TABLE is executed and "duplicate column name" errors are silently ignored. +func (b *SQLiteQueueBackend) runMigrations() { + migrations := []string{ + `ALTER TABLE queue_messages ADD COLUMN status TEXT NOT NULL DEFAULT 'pending'`, + `ALTER TABLE queue_messages ADD COLUMN processing_started_at INTEGER`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, + } + for _, stmt := range migrations { + if _, err := b.db.Exec(stmt); err != nil { + // "duplicate column name" means the column already exists — that's fine. 
+ if !strings.Contains(err.Error(), "duplicate column name") && + !strings.Contains(err.Error(), "already exists") { + log.Warnf("[queuefs] Migration warning: %v", err) + } + } + } +} + +func (b *SQLiteQueueBackend) Close() error { + if b.db != nil { + return b.db.Close() + } + return nil +} + +func (b *SQLiteQueueBackend) GetType() string { + return "sqlite" +} + +func (b *SQLiteQueueBackend) Enqueue(queueName string, msg QueueMessage) error { + msgData, err := json.Marshal(msg) + if err != nil { + return fmt.Errorf("failed to marshal message: %w", err) + } + + _, err = b.db.Exec( + "INSERT INTO queue_messages (queue_name, message_id, data, timestamp, status) VALUES (?, ?, ?, ?, 'pending')", + queueName, msg.ID, string(msgData), msg.Timestamp.Unix(), + ) + if err != nil { + return fmt.Errorf("failed to enqueue message: %w", err) + } + return nil +} + +// Dequeue marks the first pending message as 'processing' and returns it. +// The message remains in the database until Ack is called. +// If the process crashes before Ack, RecoverStale on the next startup will +// reset the message back to 'pending' so it is retried. +func (b *SQLiteQueueBackend) Dequeue(queueName string) (QueueMessage, bool, error) { + tx, err := b.db.Begin() + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to start transaction: %w", err) + } + defer tx.Rollback() + + var id int64 + var data string + err = tx.QueryRow( + "SELECT id, data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&id, &data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to query message: %w", err) + } + + // Mark as processing instead of deleting. + _, err = tx.Exec( + "UPDATE queue_messages SET status = 'processing', processing_started_at = ? 
WHERE id = ?", + time.Now().Unix(), id, + ) + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to mark message as processing: %w", err) + } + + if err := tx.Commit(); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to commit transaction: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Ack deletes a message that has been successfully processed. +// Should be called after the consumer has finished processing the message. +func (b *SQLiteQueueBackend) Ack(queueName string, messageID string) error { + result, err := b.db.Exec( + "DELETE FROM queue_messages WHERE queue_name = ? AND message_id = ? AND status = 'processing'", + queueName, messageID, + ) + if err != nil { + return fmt.Errorf("failed to ack message: %w", err) + } + rows, _ := result.RowsAffected() + if rows == 0 { + log.Warnf("[queuefs] Ack found no matching processing message: queue=%s msg=%s", queueName, messageID) + } + return nil +} + +// RecoverStale resets messages stuck in 'processing' state back to 'pending'. +// staleSec is the minimum age (in seconds) of a processing message before it +// is considered stale. Pass 0 to reset ALL processing messages immediately +// (appropriate at startup before any workers have started). +// Returns the number of messages recovered. 
+func (b *SQLiteQueueBackend) RecoverStale(staleSec int64) (int, error) { + cutoff := time.Now().Unix() - staleSec + result, err := b.db.Exec( + "UPDATE queue_messages SET status = 'pending', processing_started_at = NULL WHERE status = 'processing' AND processing_started_at <= ?", + cutoff, + ) + if err != nil { + return 0, fmt.Errorf("failed to recover stale messages: %w", err) + } + n, _ := result.RowsAffected() + return int(n), nil +} + +func (b *SQLiteQueueBackend) Peek(queueName string) (QueueMessage, bool, error) { + var data string + err := b.db.QueryRow( + "SELECT data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to peek message: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Size returns the number of pending (not yet dequeued) messages. +func (b *SQLiteQueueBackend) Size(queueName string) (int, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_messages WHERE queue_name = ? AND status = 'pending'", + queueName, + ).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to get queue size: %w", err) + } + return count, nil +} + +func (b *SQLiteQueueBackend) Clear(queueName string) error { + _, err := b.db.Exec("DELETE FROM queue_messages WHERE queue_name = ?", queueName) + if err != nil { + return fmt.Errorf("failed to clear queue: %w", err) + } + return nil +} + +func (b *SQLiteQueueBackend) ListQueues(prefix string) ([]string, error) { + var query string + var args []interface{} + + if prefix == "" { + query = "SELECT queue_name FROM queue_metadata" + } else { + query = "SELECT queue_name FROM queue_metadata WHERE queue_name = ? 
OR queue_name LIKE ?"
+		args = []interface{}{prefix, prefix + "/%"}
+	}
+
+	rows, err := b.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list queues: %w", err)
+	}
+	defer rows.Close()
+
+	var queues []string
+	for rows.Next() {
+		var qName string
+		if err := rows.Scan(&qName); err != nil {
+			return nil, fmt.Errorf("failed to scan queue name: %w", err)
+		}
+		queues = append(queues, qName)
+	}
+	return queues, nil
+}
+
+func (b *SQLiteQueueBackend) GetLastEnqueueTime(queueName string) (time.Time, error) {
+	var timestamp sql.NullInt64
+	err := b.db.QueryRow(
+		"SELECT MAX(timestamp) FROM queue_messages WHERE queue_name = ? AND status = 'pending'",
+		queueName,
+	).Scan(&timestamp)
+
+	if err != nil || !timestamp.Valid {
+		return time.Time{}, nil
+	}
+	return time.Unix(timestamp.Int64, 0), nil
+}
+
+func (b *SQLiteQueueBackend) RemoveQueue(queueName string) error {
+	if queueName == "" {
+		if _, err := b.db.Exec("DELETE FROM queue_messages"); err != nil {
+			return err
+		}
+		_, err := b.db.Exec("DELETE FROM queue_metadata")
+		return err
+	}
+
+	if _, err := b.db.Exec(
+		"DELETE FROM queue_messages WHERE queue_name = ? OR queue_name LIKE ?",
+		queueName, queueName+"/%",
+	); err != nil {
+		return fmt.Errorf("failed to remove queue messages: %w", err)
+	}
+
+	_, err := b.db.Exec(
+		"DELETE FROM queue_metadata WHERE queue_name = ? 
OR queue_name LIKE ?", + queueName, queueName+"/%", + ) + return err +} + +func (b *SQLiteQueueBackend) CreateQueue(queueName string) error { + _, err := b.db.Exec( + "INSERT OR IGNORE INTO queue_metadata (queue_name) VALUES (?)", + queueName, + ) + if err != nil { + return fmt.Errorf("failed to create queue: %w", err) + } + log.Infof("[queuefs] Created queue '%s' (SQLite)", queueName) + return nil +} + +func (b *SQLiteQueueBackend) QueueExists(queueName string) (bool, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_metadata WHERE queue_name = ?", + queueName, + ).Scan(&count) + if err != nil { + return false, fmt.Errorf("failed to check queue existence: %w", err) + } + return count > 0, nil +} From b9a51c2da9e5ed96bec56cae545c24b964c0bcb5 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Thu, 5 Mar 2026 15:54:49 +0800 Subject: [PATCH 02/18] test(transaction): add e2e rollback tests for mv and multi-step operations Add end-to-end tests covering rollback scenarios that were missing: - mv rollback: file moved back to original location on failure - mv commit: file persists at new location - Multi-step rollback: mkdir + write + mkdir all reversed in order - Partial step rollback: only completed entries are reversed - Nested directory rollback: child removed before parent - Best-effort rollback: single step failure does not block others Co-Authored-By: Claude Opus 4.6 --- tests/transaction/test_e2e.py | 199 ++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 88b6b5d6..8f67ea96 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -218,6 +218,205 @@ async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, tes assert tx_id not in all_ids +class TestE2EMvRollback: + async def test_mv_rollback_moves_file_back(self, agfs_client, tx_manager, test_dir): + """mv commit 前失败 → 文件被移回原位。""" + src = 
f"{test_dir}/mv-rb-src-{uuid.uuid4().hex}" + dst_parent = f"{test_dir}/mv-rb-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst_parent) + + # Write a file inside src + agfs_client.write(f"{src}/data.txt", b"important") + + dst = f"{dst_parent}/moved" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) + agfs_client.mv(src, dst) + tx.mark_completed(seq) + + raise RuntimeError("abort after mv") + + # src should be restored (mv reversed: dst → src) + content = agfs_client.cat(f"{src}/data.txt") + assert content == b"important" + + # dst should no longer exist + try: + agfs_client.stat(dst) + raise AssertionError("dst should not exist after rollback") + except Exception: + pass + + async def test_mv_commit_persists(self, agfs_client, tx_manager, test_dir): + """mv commit 成功 → 文件在新位置,旧位置不存在。""" + src = f"{test_dir}/mv-ok-src-{uuid.uuid4().hex}" + dst_parent = f"{test_dir}/mv-ok-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst_parent) + agfs_client.write(f"{src}/data.txt", b"moved-data") + + dst = f"{dst_parent}/moved" + + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) + agfs_client.mv(src, dst) + tx.mark_completed(seq) + await tx.commit() + + # File at new location + content = agfs_client.cat(f"{dst}/data.txt") + assert content == b"moved-data" + + # Old location gone + try: + agfs_client.stat(src) + raise AssertionError("src should not exist after committed mv") + except Exception: + pass + + +class TestE2EMultiStepRollback: + async def test_multi_step_rollback_reverses_all(self, agfs_client, tx_manager, test_dir): + """多步操作(mkdir + write + mkdir),中间失败 → 全部反序回滚。 + + 执行顺序:seq0 mkdir /a → seq1 write /a/f.txt → seq2 mkdir /a/sub + 在 seq2 完成后抛异常。 + 
回滚顺序:seq2 rm /a/sub → seq1 rm /a/f.txt → seq0 rm /a + """ + dir_a = f"{test_dir}/multi-a-{uuid.uuid4().hex}" + file_f = f"{dir_a}/f.txt" + dir_sub = f"{dir_a}/sub" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "multi_step", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_write_new", {"uri": file_f}) + agfs_client.write(file_f, b"content") + tx.mark_completed(s1) + + s2 = tx.record_undo("fs_mkdir", {"uri": dir_sub}) + agfs_client.mkdir(dir_sub) + tx.mark_completed(s2) + + raise RuntimeError("abort after all steps") + + # Everything should be cleaned up in reverse order + for path in [dir_sub, file_f, dir_a]: + try: + agfs_client.stat(path) + raise AssertionError(f"{path} should not exist after rollback") + except Exception: + pass + + async def test_partial_step_rollback(self, agfs_client, tx_manager, test_dir): + """两步操作,第二步执行到一半崩溃(未 mark_completed)→ 只回滚第一步。 + + seq0 mkdir (completed=True) → seq1 write (completed=False,异常在 mark 前抛出) + 回滚只处理 seq0。 + """ + dir_a = f"{test_dir}/partial-{uuid.uuid4().hex}" + file_f = f"{dir_a}/f.txt" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "partial", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + _s1 = tx.record_undo("fs_write_new", {"uri": file_f}) + agfs_client.write(file_f, b"half-done") + # NOT calling tx.mark_completed(s1) — simulates crash mid-operation + raise RuntimeError("crash before marking s1 completed") + + # dir_a (seq0, completed) should be rolled back + try: + agfs_client.stat(dir_a) + raise AssertionError("dir_a should be rolled back") + except Exception: + pass + + # file_f was written but undo entry not marked completed → not rolled back by normal mode + # However, file_f is inside dir_a which was removed, so it's 
gone too + + async def test_rollback_order_matters_nested_dirs(self, agfs_client, tx_manager, test_dir): + """嵌套目录回滚顺序:必须先删子目录再删父目录。 + + seq0 mkdir /parent → seq1 mkdir /parent/child + 回滚必须 seq1 (rm child) → seq0 (rm parent),否则 parent 非空删除失败。 + """ + parent = f"{test_dir}/nested-parent-{uuid.uuid4().hex}" + child = f"{parent}/child" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "nested", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": parent}) + agfs_client.mkdir(parent) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_mkdir", {"uri": child}) + agfs_client.mkdir(child) + tx.mark_completed(s1) + + raise RuntimeError("abort nested") + + # Both gone (child first, then parent) + for path in [child, parent]: + try: + agfs_client.stat(path) + raise AssertionError(f"{path} should not exist") + except Exception: + pass + + async def test_rollback_failure_best_effort_continues(self, agfs_client, tx_manager, test_dir): + """回滚中某步失败,后续步骤仍然执行(best-effort)。 + + seq0 mkdir /a → seq1 mkdir /b + 手动删除 /b(模拟回滚 seq1 时目标已不存在),seq0 的回滚仍应执行。 + """ + dir_a = f"{test_dir}/be-a-{uuid.uuid4().hex}" + dir_b = f"{test_dir}/be-b-{uuid.uuid4().hex}" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "best_effort", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_mkdir", {"uri": dir_b}) + agfs_client.mkdir(dir_b) + tx.mark_completed(s1) + + # Manually remove dir_b before rollback — simulates external interference + agfs_client.rm(dir_b) + + raise RuntimeError("abort") + + # dir_b removal during rollback "fails" (already gone), but dir_a should still be rolled back + try: + agfs_client.stat(dir_a) + raise AssertionError("dir_a should be rolled back despite dir_b failure") + except Exception: + pass + + class TestE2ESequentialTransactions: async def 
test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): """Two sequential transactions on the same path both succeed.""" From b52ccbfeff92a328bb551769e22cc4f3a3865239 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Fri, 13 Mar 2026 19:52:19 +0800 Subject: [PATCH 03/18] feat(storage): add transaction support with path locking and journal Implement transaction system for VikingFS with ACID-like guarantees: - TransactionManager with configurable lock timeout and journal-based recovery - PathLock supporting point, subtree, and mv lock modes - Refactor VikingFS mv to use cp+rm to prevent lock files from being carried - Fix stale lock detection returning false for missing lock files - Update ragas eval to use LangchainLLMWrapper Co-Authored-By: Claude Opus 4.6 --- openviking/async_client.py | 5 + openviking/eval/ragas/__init__.py | 17 +- openviking/service/core.py | 2 +- openviking/storage/local_fs.py | 3 +- openviking/storage/queuefs/queue_manager.py | 22 +- openviking/storage/queuefs/semantic_dag.py | 55 ++-- openviking/storage/transaction/__init__.py | 2 + .../storage/transaction/context_manager.py | 7 +- openviking/storage/transaction/journal.py | 3 +- openviking/storage/transaction/path_lock.py | 282 ++++-------------- .../transaction/transaction_manager.py | 30 +- openviking/storage/viking_fs.py | 53 +++- tests/agfs/test_fs_s3.py | 3 +- tests/client/test_resource_management.py | 2 +- tests/integration/test_add_resource_index.py | 11 +- tests/integration/test_full_workflow.py | 16 +- tests/server/test_api_filesystem.py | 45 ++- tests/server/test_api_resources.py | 4 +- tests/storage/test_semantic_dag_stats.py | 21 ++ tests/transaction/test_e2e.py | 4 +- tests/transaction/test_transaction_manager.py | 8 +- 21 files changed, 256 insertions(+), 339 deletions(-) diff --git a/openviking/async_client.py b/openviking/async_client.py index 67dfa696..3f410a13 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -96,6 +96,11 @@ 
async def reset(cls) -> None: await cls._instance.close() cls._instance = None + # Also reset transaction manager singleton + from openviking.storage.transaction import reset_transaction_manager + + reset_transaction_manager() + # ============= Session methods ============= def session(self, session_id: Optional[str] = None, must_exist: bool = False) -> Session: diff --git a/openviking/eval/ragas/__init__.py b/openviking/eval/ragas/__init__.py index 03336bc7..df295210 100644 --- a/openviking/eval/ragas/__init__.py +++ b/openviking/eval/ragas/__init__.py @@ -111,8 +111,8 @@ def _create_ragas_llm_from_config() -> Optional[Any]: RAGAS LLM instance or None if VLM is not configured. """ try: - from openai import OpenAI - from ragas.llms import llm_factory + from langchain_openai import ChatOpenAI + from ragas.llms import LangchainLLMWrapper except ImportError: return None @@ -124,11 +124,12 @@ def _create_ragas_llm_from_config() -> Optional[Any]: logger.info(f"Using RAGAS LLM from environment: model={model_name}, base_url={api_base}") - client = OpenAI( + openai_model = ChatOpenAI( + model=model_name, api_key=api_key, base_url=api_base, ) - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) try: from openviking_cli.utils.config import get_openviking_config @@ -151,13 +152,13 @@ def _create_ragas_llm_from_config() -> Optional[Any]: ) return None - client = OpenAI( + model_name = vlm_config.model or "gpt-4o-mini" + openai_model = ChatOpenAI( + model=model_name, api_key=vlm_config.api_key, base_url=vlm_config.api_base, ) - - model_name = vlm_config.model or "gpt-4o-mini" - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) class RagasEvaluator(BaseEvaluator): diff --git a/openviking/service/core.py b/openviking/service/core.py index 093db8f9..912f421a 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -307,7 +307,7 @@ async def initialize(self) -> None: async def 
close(self) -> None: """Close OpenViking and release resources.""" if self._transaction_manager: - self._transaction_manager.stop() + await self._transaction_manager.stop() self._transaction_manager = None if self._vikingdb_manager: diff --git a/openviking/storage/local_fs.py b/openviking/storage/local_fs.py index 0181c873..88a20720 100644 --- a/openviking/storage/local_fs.py +++ b/openviking/storage/local_fs.py @@ -11,6 +11,7 @@ from openviking.server.identity import RequestContext from openviking.storage.queuefs import EmbeddingQueue, get_queue_manager from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter +from openviking_cli.exceptions import NotFoundError from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI @@ -176,7 +177,7 @@ async def import_ovpack( f"Resource already exists at {root_uri}. Use force=True to overwrite." ) logger.info(f"[local_fs] Overwriting existing resource at {root_uri}") - except FileNotFoundError: + except NotFoundError: # Path does not exist, safe to import pass diff --git a/openviking/storage/queuefs/queue_manager.py b/openviking/storage/queuefs/queue_manager.py index b5a68af4..52b42476 100644 --- a/openviking/storage/queuefs/queue_manager.py +++ b/openviking/storage/queuefs/queue_manager.py @@ -245,9 +245,21 @@ async def process_one(data: Dict[str, Any]) -> None: await asyncio.sleep(self._poll_interval) - # Drain remaining in-flight tasks on shutdown + # Drain remaining in-flight tasks on shutdown (with timeout) if active_tasks: - await asyncio.gather(*active_tasks, return_exceptions=True) + try: + await asyncio.wait_for( + asyncio.gather(*active_tasks, return_exceptions=True), + timeout=5.0, + ) + except asyncio.TimeoutError: + logger.warning( + f"[QueueManager] Drain timeout for {queue.name}, " + f"cancelling {len(active_tasks)} in-flight task(s)" + ) + for t in active_tasks: + t.cancel() + await asyncio.gather(*active_tasks, return_exceptions=True) def 
stop(self) -> None: """Stop QueueManager and release resources.""" @@ -258,8 +270,10 @@ def stop(self) -> None: # Stop queue workers for stop_event in self._queue_stop_events.values(): stop_event.set() - for thread in self._queue_threads.values(): - thread.join() + for name, thread in self._queue_threads.items(): + thread.join(timeout=10.0) + if thread.is_alive(): + logger.warning(f"[QueueManager] Worker thread {name} did not exit in time") self._queue_threads.clear() self._queue_stop_events.clear() diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 0e894474..3a05487e 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -237,6 +237,28 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: results.append(item) return results + async def _execute_overview( + self, + dir_uri: str, + file_summaries: List[Dict[str, str]], + children_abstracts: List[Dict[str, str]], + ) -> str: + """Generate overview/abstract, write files, and vectorize a directory.""" + async with self._llm_sem: + overview = await self._processor._generate_overview( + dir_uri, file_summaries, children_abstracts + ) + abstract = self._processor._extract_abstract_from_overview(overview) + await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) + await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) + try: + await self._processor._vectorize_directory_simple( + dir_uri, self._context_type, abstract, overview, ctx=self._ctx + ) + except Exception as e: + logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + return abstract + async def _overview_task(self, dir_uri: str) -> None: from openviking.storage.errors import LockAcquisitionError from openviking.storage.transaction import TransactionContext, get_transaction_manager @@ -250,43 +272,26 @@ async def _overview_task(self, dir_uri: str) -> 
None: children_abstracts = self._finalize_children_abstracts(node) abstract = "" - dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) - try: + dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) async with TransactionContext( get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" ) as tx: - async with self._llm_sem: - overview = await self._processor._generate_overview( - dir_uri, file_summaries, children_abstracts - ) - abstract = self._processor._extract_abstract_from_overview(overview) - await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) - await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) - try: - await self._processor._vectorize_directory_simple( - dir_uri, self._context_type, abstract, overview, ctx=self._ctx - ) - except Exception as e: - logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + abstract = await self._execute_overview(dir_uri, file_summaries, children_abstracts) await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") - abstract = "" except Exception as e: logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True) - abstract = "" finally: self._stats.done_nodes += 1 self._stats.in_progress_nodes = max(0, self._stats.in_progress_nodes - 1) - - parent_uri = self._parent.get(dir_uri) - if parent_uri is None: - if self._root_done: - self._root_done.set() - return - - await self._on_child_done(parent_uri, dir_uri, abstract) + parent_uri = self._parent.get(dir_uri) + if parent_uri is None: + if self._root_done: + self._root_done.set() + else: + await self._on_child_done(parent_uri, dir_uri, abstract) def get_stats(self) -> DagStats: return DagStats( diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index 2730cd2e..afbc3e1e 100644 --- a/openviking/storage/transaction/__init__.py +++ 
b/openviking/storage/transaction/__init__.py @@ -13,6 +13,7 @@ TransactionManager, get_transaction_manager, init_transaction_manager, + reset_transaction_manager, ) from openviking.storage.transaction.transaction_record import ( TransactionRecord, @@ -31,4 +32,5 @@ "execute_rollback", "get_transaction_manager", "init_transaction_manager", + "reset_transaction_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 10107dde..68ad9784 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -36,12 +36,14 @@ def __init__( lock_paths: List[str], lock_mode: str = "point", mv_dst_path: Optional[str] = None, + src_is_dir: bool = True, ): self._tx_manager = tx_manager self._operation = operation self._lock_paths = lock_paths self._lock_mode = lock_mode self._mv_dst_path = mv_dst_path + self._src_is_dir = src_is_dir self._record: Optional[TransactionRecord] = None self._committed = False self._sequence = 0 @@ -81,7 +83,10 @@ async def __aenter__(self) -> "TransactionContext": if len(self._lock_paths) < 1 or not self._mv_dst_path: raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") success = await self._tx_manager.acquire_lock_mv( - tx_id, self._lock_paths[0], self._mv_dst_path + tx_id, + self._lock_paths[0], + self._mv_dst_path, + src_is_dir=self._src_is_dir, ) else: # "point" mode (default) diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py index d641e905..6cb14474 100644 --- a/openviking/storage/transaction/journal.py +++ b/openviking/storage/transaction/journal.py @@ -10,8 +10,7 @@ import json from typing import Any, Dict, List -from pyagfs import AGFSClient - +from openviking.pyagfs import AGFSClient from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) diff --git a/openviking/storage/transaction/path_lock.py 
b/openviking/storage/transaction/path_lock.py index 7cae0d9c..e2879694 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -1,34 +1,3 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Path lock implementation for transaction management. - -Provides path-based locking mechanism to prevent concurrent directory operations. -Lock protocol: viking://resources/.../.path.ovlock file exists = locked - -Lock files contain a fencing token in the format ``{tx_id}:{time_ns}:{lock_type}`` so that -stale locks (left by crashed processes) can be detected and removed. - -Two lock types: - POINT (P): Locks a specific directory for write/semantic operations. - Blocks if any ancestor holds a SUBTREE lock. - SUBTREE (S): Locks an entire directory subtree for rm/mv-source operations. - Blocks if any descendant holds any lock. - -Livelock prevention: after both parties write their lock files and detect a conflict, -the "later" one (larger (timestamp, tx_id)) backs off and retries. - -# TODO(multi-node): File-based locks only work correctly when all nodes share the -# same AGFS backend with strong read-write consistency. For multi-node deployments -# with replicated or partitioned storage, replace this implementation with a -# distributed lock backend (e.g. etcd txn+lease, ZooKeeper ephemeral nodes). -# The PathLock interface should be extracted to allow swappable backends. -# Key requirements for a distributed backend: -# - Atomic compare-and-set (to avoid write-write races on lock acquisition) -# - Session-bound leases (so crashed nodes auto-release without TTL polling) -# - Monotonically increasing fencing tokens (etcd revision works well) -""" - import asyncio import time from typing import Optional, Tuple @@ -51,40 +20,13 @@ def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: - """Create a fencing token for a transaction. 
- - Format: ``{tx_id}:{time_ns}:{lock_type}`` where time_ns is the current - wall-clock time in nanoseconds and lock_type is P or S. - - Args: - tx_id: Transaction ID - lock_type: Lock type, either LOCK_TYPE_POINT ("P") or LOCK_TYPE_SUBTREE ("S") - - Returns: - Fencing token string - """ return f"{tx_id}:{time.time_ns()}:{lock_type}" def _parse_fencing_token(token: str) -> Tuple[str, int, str]: - """Parse a fencing token into (tx_id, timestamp_ns, lock_type). - - Supports: - - New format: ``{tx_id}:{time_ns}:P`` or ``{tx_id}:{time_ns}:S`` - - Legacy format: ``{tx_id}:{time_ns}`` (defaults to POINT) - - Very legacy: plain tx_id (ts=0, defaults to POINT) - - Args: - token: Fencing token string - - Returns: - (tx_id, timestamp_ns, lock_type) — timestamp_ns is 0 for legacy tokens, - lock_type defaults to LOCK_TYPE_POINT for legacy tokens. - """ - # New format ends with ":P" or ":S" if token.endswith(f":{LOCK_TYPE_POINT}") or token.endswith(f":{LOCK_TYPE_SUBTREE}"): lock_type = token[-1] - rest = token[:-2] # strip ":{lock_type}" + rest = token[:-2] idx = rest.rfind(":") if idx >= 0: tx_id_part = rest[:idx] @@ -95,7 +37,6 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: pass return rest, 0, lock_type - # Legacy format: {tx_id}:{time_ns} if ":" in token: idx = token.rfind(":") tx_id_part = token[:idx] @@ -109,34 +50,15 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: class PathLock: - """Path lock manager for transaction-based directory locking. - - Implements path-based locking using lock files (.path.ovlock) to prevent - concurrent operations on the same directory tree. - - Two lock types: - POINT (P): Used for write and semantic processing operations. - SUBTREE (S): Used for rm and mv-source operations. - """ - def __init__(self, agfs_client: AGFSClient, lock_expire: float = 300.0): - """Initialize path lock manager. 
- - Args: - agfs_client: AGFS client for file system operations - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - Locks held longer than this by a crashed process are force-released. - """ self._agfs = agfs_client self._lock_expire = lock_expire def _get_lock_path(self, path: str) -> str: - """Get lock file path for a directory.""" path = path.rstrip("/") return f"{path}/{LOCK_FILE_NAME}" def _get_parent_path(self, path: str) -> Optional[str]: - """Get parent directory path.""" path = path.rstrip("/") if "/" not in path: return None @@ -144,17 +66,17 @@ def _get_parent_path(self, path: str) -> Optional[str]: return parent if parent else None def _read_token(self, lock_path: str) -> Optional[str]: - """Read fencing token from lock file, returning None if absent.""" try: content = self._agfs.cat(lock_path) if isinstance(content, bytes): - return content.decode("utf-8").strip() - return str(content).strip() + token = content.decode("utf-8").strip() + else: + token = str(content).strip() + return token if token else None except Exception: return None async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: - """Check if path is locked by another transaction (any lock type).""" token = self._read_token(lock_path) if token is None: return False @@ -164,59 +86,36 @@ async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool async def _create_lock_file( self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT ) -> None: - """Create lock file with fencing token.""" token = _make_fencing_token(transaction_id, lock_type) self._agfs.write(lock_path, token.encode("utf-8")) async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: - """Verify lock file is owned by current transaction.""" token = self._read_token(lock_path) if token is None: return False lock_owner, _, _ = _parse_fencing_token(token) return lock_owner == transaction_id - async def 
_remove_lock_file(self, lock_path: str) -> None: - """Remove lock file.""" + async def _remove_lock_file(self, lock_path: str) -> bool: try: self._agfs.rm(lock_path) - except Exception: - pass + return True + except Exception as e: + if "not found" in str(e).lower(): + return True + return False def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: - """Check if a lock file is stale (left by a crashed process). - - A lock is considered stale if: - - The lock file does not exist (already cleaned up) - - The lock file contains a legacy token (no timestamp) - - The lock has been held longer than ``expire_seconds`` - - Args: - lock_path: Lock file path - expire_seconds: Lock expiry threshold in seconds (default: 5 minutes) - - Returns: - True if the lock is stale, False if it is still fresh - """ token = self._read_token(lock_path) if token is None: - return True # No file = stale + return True _, ts, _ = _parse_fencing_token(token) if ts == 0: - return True # Legacy format = consider stale + return True age = (time.time_ns() - ts) / 1e9 return age > expire_seconds async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: - """Walk all ancestor directories and return the first SUBTREE lock held by another tx. - - Args: - path: Starting directory path (its ancestors are checked, not itself) - exclude_tx_id: Transaction ID to exclude from conflict detection - - Returns: - Lock file path of the conflicting SUBTREE lock, or None if no conflict - """ parent = self._get_parent_path(path) while parent: lock_path = self._get_lock_path(parent) @@ -229,15 +128,6 @@ async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> O return None async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: - """Recursively scan all descendant directories for locks held by another tx. 
- - Args: - path: Root directory path to scan (its own lock is NOT checked here) - exclude_tx_id: Transaction ID to exclude from conflict detection - - Returns: - Lock file path of the first conflicting lock found, or None if no conflict - """ try: entries = self._agfs.ls(path) if not isinstance(entries, list): @@ -257,7 +147,6 @@ async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Op owner_id, _, _ = _parse_fencing_token(token) if owner_id != exclude_tx_id: return subdir_lock - # Recurse into subdir result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) if result: return result @@ -268,37 +157,10 @@ async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Op async def acquire_point( self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire POINT lock for write/semantic-processing operations. - - A POINT lock is placed on a single directory. It conflicts with: - - Any lock (P or S) on the same directory by another transaction - - Any SUBTREE (S) lock on any ancestor directory - - Lock acquisition flow: - 1. Check target directory exists - 2. Check if target directory is locked by another transaction → wait/stale-remove - 3. Check if any ancestor holds a SUBTREE lock → wait/stale-remove - 4. Write POINT(P) lock file - 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks - - Conflict found: compare (ts, tx_id); later one backs off and retries - 6. Verify lock ownership - 7. Return success - - Args: - path: Directory path to lock - transaction: Transaction record - timeout: Maximum time to wait for the lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. 
- - Returns: - True if lock acquired successfully, False if timeout exceeded - """ transaction_id = transaction.id lock_path = self._get_lock_path(path) deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check target directory exists (once, before polling) try: self._agfs.stat(path) except Exception: @@ -306,11 +168,14 @@ async def acquire_point( return False while True: - # Step 2: Check if target directory is locked by another transaction if await self._is_locked_by_other(lock_path, transaction_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[POINT] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") @@ -318,7 +183,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 3: Check all ancestors for SUBTREE locks ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): @@ -326,6 +190,12 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning( @@ -335,14 +205,12 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 4: Write POINT lock file try: await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) except Exception as e: logger.error(f"[POINT] Failed to create lock file: {e}") return 
False - # Step 5: TOCTOU double-check ancestors for SUBTREE locks backed_off = False conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) if conflict_after: @@ -353,13 +221,10 @@ async def acquire_point( _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) ) - # Later one (larger (ts, tx_id)) backs off if (my_ts, transaction_id) > (their_ts, their_tx_id): logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - # Either: I backed off, or they will back off. - # In both cases restart the outer loop after a brief wait. if asyncio.get_event_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) @@ -367,7 +232,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 6: Verify lock ownership if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[POINT] Lock ownership verification failed: {path}") if asyncio.get_event_loop().time() >= deadline: @@ -375,7 +239,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Success transaction.add_lock(lock_path) logger.debug(f"[POINT] Lock acquired: {lock_path}") return True @@ -383,38 +246,10 @@ async def acquire_point( async def acquire_subtree( self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire SUBTREE lock for rm/mv-source operations. - - A SUBTREE lock is placed on a single directory (the root of the subtree). - It conflicts with: - - Any lock (P or S) on the same directory by another transaction - - Any lock (P or S) on any descendant directory by another transaction - - Lock acquisition flow: - 1. Check target directory exists - 2. Check if target directory is locked by another transaction → wait/stale-remove - 3. Scan all descendants for any locks → wait/stale-remove - 4. Write SUBTREE(S) lock file (only one file, at the root path) - 5. 
TOCTOU double-check: re-scan descendants for any new locks - - Conflict found: compare (ts, tx_id); later one backs off and retries - 6. Verify lock ownership - 7. Return success - - Args: - path: Directory path to lock (root of the subtree) - transaction: Transaction record - timeout: Maximum time to wait for the lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. - - Returns: - True if lock acquired successfully, False if timeout exceeded - """ transaction_id = transaction.id lock_path = self._get_lock_path(path) deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check target directory exists try: self._agfs.stat(path) except Exception: @@ -422,11 +257,14 @@ async def acquire_subtree( return False while True: - # Step 2: Check if target directory is locked by another transaction if await self._is_locked_by_other(lock_path, transaction_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") @@ -434,12 +272,17 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 3: Scan all descendants for any locks by other transactions desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" + ) + return False + await 
asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning( @@ -449,14 +292,12 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 4: Write SUBTREE lock file (only one file) try: await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) except Exception as e: logger.error(f"[SUBTREE] Failed to create lock file: {e}") return False - # Step 5: TOCTOU double-check descendants backed_off = False conflict_after = await self._scan_descendants_for_locks(path, transaction_id) if conflict_after: @@ -467,13 +308,10 @@ async def acquire_subtree( _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) ) - # Later one (larger (ts, tx_id)) backs off if (my_ts, transaction_id) > (their_ts, their_tx_id): logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - # Either: I backed off, or they will back off. - # In both cases restart the outer loop after a brief wait. if asyncio.get_event_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) @@ -481,7 +319,6 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 6: Verify lock ownership if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") if asyncio.get_event_loop().time() >= deadline: @@ -489,7 +326,6 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Success transaction.add_lock(lock_path) logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True @@ -500,46 +336,30 @@ async def acquire_mv( dst_path: str, transaction: TransactionRecord, timeout: float = 0.0, + src_is_dir: bool = True, ) -> bool: - """Acquire path lock for mv operation. - - Lock acquisition flow for mv operations: - 1. Acquire SUBTREE lock on source directory - 2. 
Acquire POINT lock on destination parent directory - - Args: - src_path: Source directory path - dst_path: Destination parent directory path - transaction: Transaction record - timeout: Maximum time to wait for each lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. - - Returns: - True if all locks acquired successfully, False otherwise - """ - # Step 1: Lock source directory with SUBTREE lock - if not await self.acquire_subtree(src_path, transaction, timeout=timeout): - logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") - return False - - # Step 2: Lock destination parent directory with POINT lock - if not await self.acquire_point(dst_path, transaction, timeout=timeout): - logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") - # Release source lock - await self.release(transaction) - return False + if src_is_dir: + if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") + return False + if not await self.acquire_subtree(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") + await self.release(transaction) + return False + else: + src_parent = src_path.rsplit("/", 1)[0] if "/" in src_path else src_path + if not await self.acquire_point(src_parent, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") + return False + if not await self.acquire_point(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + await self.release(transaction) + return False logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True async def release(self, transaction: TransactionRecord) -> None: - """Release all locks held by the transaction. 
- - Args: - transaction: Transaction record - """ - # Release locks in reverse order (LIFO) for lock_path in reversed(transaction.locks): await self._remove_lock_file(lock_path) transaction.remove_lock(lock_path) diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index d83d8464..28dfe64d 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -100,7 +100,7 @@ async def start(self) -> None: logger.info("TransactionManager started") - def stop(self) -> None: + async def stop(self) -> None: """Stop transaction manager. Stops the background cleanup task and releases all resources. @@ -114,11 +114,17 @@ def stop(self) -> None: # Cancel cleanup task if self._cleanup_task: self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass self._cleanup_task = None - # Release all active transactions + # Release all active transactions' locks for tx_id in list(self._transactions.keys()): - self._transactions.pop(tx_id, None) + tx = self._transactions.pop(tx_id, None) + if tx: + await self._path_lock.release(tx) logger.info("TransactionManager stopped") @@ -508,14 +514,16 @@ async def acquire_lock_mv( src_path: str, dst_path: str, timeout: Optional[float] = None, + src_is_dir: bool = True, ) -> bool: """Acquire path lock for mv operation. 
Args: transaction_id: Transaction ID - src_path: Source directory path + src_path: Source path dst_path: Destination parent directory path timeout: Maximum time to wait for each lock in seconds (default: from config) + src_is_dir: Whether the source is a directory Returns: True if lock acquired successfully, False otherwise @@ -528,7 +536,7 @@ async def acquire_lock_mv( tx.update_status(TransactionStatus.AQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_mv( - src_path, dst_path, tx, timeout=effective_timeout + src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir ) if success: @@ -603,3 +611,15 @@ def get_transaction_manager() -> Optional[TransactionManager]: TransactionManager instance or None if not initialized """ return _transaction_manager + + +def reset_transaction_manager() -> None: + """Reset the transaction manager singleton (for testing). + + This function should ONLY be used in tests to clean up state between tests. + It clears the global singleton instance without performing cleanup - make sure + to call stop() first if the manager is still running. + """ + global _transaction_manager + with _lock: + _transaction_manager = None diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index f7395238..0e954ca9 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -324,9 +324,10 @@ async def mv( ) -> Dict[str, Any]: """Move file/directory + recursively update vector index. - Wrapped in a transaction: performs FS mv first, then VectorDB URI update. - On rollback, the file is moved back and VectorDB mappings are restored. + Implemented as cp + rm to avoid lock files being carried by FS mv. + On rollback, the copy is deleted and the source remains intact. 
""" + from openviking.pyagfs.helpers import cp as agfs_cp from openviking.storage.transaction import TransactionContext, get_transaction_manager self._ensure_access(old_uri, ctx) @@ -350,30 +351,43 @@ async def mv( logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - # Verify source exists before locking + # Verify source exists and determine type before locking try: - self.agfs.stat(old_path) + stat = self.agfs.stat(old_path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False except Exception: raise FileNotFoundError(f"mv source not found: {old_uri}") - # Lock source and destination's parent (dst doesn't exist yet) dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path async with TransactionContext( - tx_manager, "mv", [old_path], lock_mode="mv", mv_dst_path=dst_parent + tx_manager, + "mv", + [old_path], + lock_mode="mv", + mv_dst_path=dst_parent, + src_is_dir=is_dir, ) as tx: - # Step 1: FS move - seq_mv = tx.record_undo("fs_mv", {"src": old_path, "dst": new_path}) + # Step 1: Copy source to destination + seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) try: - result = self.agfs.mv(old_path, new_path) - except AGFSHTTPError as e: - if e.status_code == 404: + agfs_cp(self.agfs, old_path, new_path, recursive=is_dir) + except Exception as e: + if "not found" in str(e).lower(): await self._delete_from_vector_store(uris_to_move, ctx=ctx) logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - tx.mark_completed(seq_mv) + tx.mark_completed(seq_cp) + + # Step 2: Remove carried lock file from the copy (directory only) + if is_dir: + carried_lock = new_path.rstrip("/") + "/.path.ovlock" + try: + self.agfs.rm(carried_lock) + except Exception: + pass - # Step 2: Update VectorDB URIs + # Step 3: Update VectorDB URIs old_uri_stripped = old_uri.rstrip("/") old_parent_uri = ( old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" @@ -390,8 
+404,13 @@ async def mv( await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) tx.mark_completed(seq_vdb) + # Step 4: Remove source (lock file gets deleted along with it) + seq_rm = tx.record_undo("fs_rm", {"uri": old_path, "recursive": is_dir}) + self.agfs.rm(old_path, recursive=is_dir) + tx.mark_completed(seq_rm) + await tx.commit() - return result + return {} async def grep( self, @@ -1380,6 +1399,12 @@ async def read_file( """ self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) + # Verify the file exists before reading, because AGFS read returns + # empty bytes for non-existent files instead of raising an error. + try: + self.agfs.stat(path) + except Exception: + raise NotFoundError(uri, "file") try: content = self.agfs.read(path) except Exception: diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index 330c7089..ff9647e4 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -46,7 +46,8 @@ def load_agfs_config() -> AGFSConfig: AGFS_CONF = load_agfs_config() -AGFS_CONF.mode = "http-client" +if AGFS_CONF is not None: + AGFS_CONF.mode = "http-client" # 2. 
Skip tests if no S3 config found or backend is not S3 pytestmark = pytest.mark.skipif( diff --git a/tests/client/test_resource_management.py b/tests/client/test_resource_management.py index 294ce8d4..8a24c8e0 100644 --- a/tests/client/test_resource_management.py +++ b/tests/client/test_resource_management.py @@ -51,7 +51,7 @@ async def test_add_resource_with_target( """Test adding resource to specified target""" result = await client.add_resource( path=str(sample_markdown_file), - target="viking://resources/custom/", + to="viking://resources/custom/sample", reason="Test resource", ) diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 32421e69..27d6e234 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -1,10 +1,8 @@ -import pytest -import asyncio -import os import json -import shutil -from pathlib import Path -from unittest.mock import MagicMock, AsyncMock, patch +import os +from unittest.mock import AsyncMock, patch + +import pytest from openviking.async_client import AsyncOpenViking from openviking_cli.utils.config.open_viking_config import OpenVikingConfigSingleton @@ -96,6 +94,7 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), + patch("openviking.storage.transaction.get_transaction_manager", return_value=None), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/integration/test_full_workflow.py b/tests/integration/test_full_workflow.py index 3f86b559..823cefd7 100644 --- a/tests/integration/test_full_workflow.py +++ b/tests/integration/test_full_workflow.py @@ -67,11 +67,17 @@ async def test_add_search_read_workflow( # 3. 
Read searched resource if search_result.resources: - res = await client.tree(search_result.resources[0].uri) - for data in res: - if not data["isDir"]: - content = await client.read(data["uri"]) - assert len(content) > 0 + uri = search_result.resources[0].uri + info = await client.stat(uri) + if info.get("isDir"): + res = await client.tree(uri) + for data in res: + if not data["isDir"]: + content = await client.read(data["uri"]) + assert len(content) > 0 + else: + content = await client.read(uri) + assert len(content) > 0 class TestSessionWorkflow: diff --git a/tests/server/test_api_filesystem.py b/tests/server/test_api_filesystem.py index 79058d37..3a0da611 100644 --- a/tests/server/test_api_filesystem.py +++ b/tests/server/test_api_filesystem.py @@ -66,14 +66,6 @@ async def test_tree(client: httpx.AsyncClient): assert body["status"] == "ok" -async def test_stat_after_add_resource(client_with_resource): - client, uri = client_with_resource - resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) - assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - - async def test_stat_not_found(client: httpx.AsyncClient): resp = await client.get( "/api/v1/fs/stat", @@ -84,18 +76,28 @@ async def test_stat_not_found(client: httpx.AsyncClient): assert body["status"] == "error" -async def test_rm_resource(client_with_resource): +async def test_resource_ops(client_with_resource): + """Test stat, ls_recursive, mv, rm on a single shared resource.""" + import uuid + client, uri = client_with_resource - resp = await client.request("DELETE", "/api/v1/fs", params={"uri": uri, "recursive": True}) + + # stat + resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) assert resp.status_code == 200 assert resp.json()["status"] == "ok" + # ls recursive + resp = await client.get( + "/api/v1/fs/ls", + params={"uri": "viking://", "recursive": True}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "ok" + assert 
isinstance(body["result"], list) -async def test_mv_resource(client_with_resource): - import uuid - - client, uri = client_with_resource - # Use a unique name to avoid conflicts with leftover data + # mv unique = uuid.uuid4().hex[:8] new_uri = uri.rstrip("/") + f"_mv_{unique}/" resp = await client.post( @@ -105,14 +107,7 @@ async def test_mv_resource(client_with_resource): assert resp.status_code == 200 assert resp.json()["status"] == "ok" - -async def test_ls_recursive(client_with_resource): - client, _ = client_with_resource - resp = await client.get( - "/api/v1/fs/ls", - params={"uri": "viking://", "recursive": True}, - ) + # rm (on the moved uri) + resp = await client.request("DELETE", "/api/v1/fs", params={"uri": new_uri, "recursive": True}) assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - assert isinstance(body["result"], list) + assert resp.json()["status"] == "ok" diff --git a/tests/server/test_api_resources.py b/tests/server/test_api_resources.py index 013c6baa..16ed1a71 100644 --- a/tests/server/test_api_resources.py +++ b/tests/server/test_api_resources.py @@ -5,8 +5,6 @@ import httpx -from tests.server.conftest import SAMPLE_MD_CONTENT - async def test_add_resource_success(client: httpx.AsyncClient, sample_markdown_file): resp = await client.post( @@ -55,7 +53,7 @@ async def test_add_resource_with_target(client: httpx.AsyncClient, sample_markdo "/api/v1/resources", json={ "path": str(sample_markdown_file), - "target": "viking://resources/custom/", + "to": "viking://resources/custom/sample", "reason": "test resource", }, ) diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 10f06c22..202db790 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -1,6 +1,8 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import AsyncMock, MagicMock + import pytest from openviking.server.identity import RequestContext, Role @@ -19,6 +21,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -59,6 +64,22 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): fake_fs = _FakeVikingFS(tree) monkeypatch.setattr("openviking.storage.queuefs.semantic_dag.get_viking_fs", lambda: fake_fs) + # Mock transaction layer: TransactionContext as no-op passthrough + mock_tx = MagicMock() + mock_tx.commit = AsyncMock() + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", + AsyncMock(return_value=mock_tx), + ) + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.transaction_manager.get_transaction_manager", + lambda: MagicMock(), + ) + processor = _FakeProcessor() ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) executor = SemanticDagExecutor( diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 8f67ea96..d498fb59 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -24,8 +24,8 @@ def tx_manager(agfs_client): agfs_client=agfs_client, timeout=3600, max_parallel_locks=8, - lock_timeout=5.0, - lock_expire=300.0, + lock_timeout=1.0, + lock_expire=1.0, ) return manager diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py index ab9d5256..3d6fd198 100644 --- a/tests/transaction/test_transaction_manager.py +++ b/tests/transaction/test_transaction_manager.py @@ -283,20 +283,20 @@ async def 
test_start_idempotent(self): await manager.start() await manager.start() # Should not error assert manager._running is True - manager.stop() + await manager.stop() async def test_stop_clears_state(self): manager, _ = _make_manager() await manager.start() manager.create_transaction() - manager.stop() + await manager.stop() assert manager._running is False assert manager.get_transaction_count() == 0 async def test_stop_idempotent(self): manager, _ = _make_manager() - manager.stop() - manager.stop() # Should not error + await manager.stop() + await manager.stop() # Should not error class TestTimeoutCleanup: From 6cab58fb990f093cf337a1f7d07c7594003b9115 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 14:28:09 +0800 Subject: [PATCH 04/18] fix: tests --- docs/en/concepts/09-transaction.md | 8 ++-- docs/zh/concepts/09-transaction.md | 8 ++-- openviking/service/core.py | 1 + openviking/storage/local_fs.py | 5 ++- .../storage/observers/transaction_observer.py | 6 +-- openviking/storage/queuefs/semantic_dag.py | 2 + .../storage/queuefs/semantic_processor.py | 2 + .../storage/transaction/context_manager.py | 2 +- openviking/storage/transaction/path_lock.py | 26 +++++++++++- .../transaction/transaction_manager.py | 41 +++++++++++++++---- .../storage/transaction/transaction_record.py | 4 +- openviking/storage/viking_fs.py | 13 ------ .../test_hierarchical_retriever_rerank.py | 4 +- tests/server/conftest.py | 21 ++++++++++ tests/transaction/test_context_manager.py | 4 +- tests/transaction/test_crash_recovery.py | 6 +-- tests/transaction/test_e2e.py | 4 +- tests/transaction/test_path_lock.py | 6 +-- tests/transaction/test_transaction_manager.py | 2 +- 19 files changed, 115 insertions(+), 50 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 65ec4c3b..3469ed2f 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -251,7 +251,7 @@ Contains: transaction ID, status, lock paths, 
init_info, undo_log, post_actions. ``` Create transaction -> write journal (INIT) -Acquire lock -> update journal (AQUIRE -> EXEC) +Acquire lock -> update journal (ACQUIRE -> EXEC) Execute changes -> update journal per step (mark undo entry completed) Commit -> update journal (COMMIT + post_actions) -> execute post_actions -> release locks -> delete journal @@ -267,7 +267,7 @@ Rollback -> execute undo log -> release locks -> delete journal | `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | | `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | | `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | -| `INIT` / `AQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | +| `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | ### Defense Summary @@ -283,13 +283,13 @@ Rollback -> execute undo log -> release locks -> delete journal ## Transaction State Machine ``` -INIT -> AQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED +INIT -> ACQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED | FAIL -> RELEASING -> RELEASED ``` - `INIT`: Transaction created, waiting for lock -- `AQUIRE`: Acquiring lock +- `ACQUIRE`: Acquiring lock - `EXEC`: Transaction operations executing - `COMMIT`: Committed, post_actions may be pending - `FAIL`: Execution failed, entering rollback diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 99723042..6397cd2d 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -251,7 +251,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as ``` 创建事务 → 写 journal(INIT) -获取锁 → 更新 journal(AQUIRE → EXEC) +获取锁 → 更新 journal(ACQUIRE → EXEC) 执行变更 → 每步更新 journal(标记 undo entry completed) 提交 → 更新 journal(COMMIT + post_actions) → 执行 
post_actions → 删锁 → 删 journal @@ -267,7 +267,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | | `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | | `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | -| `INIT` / `AQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | +| `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | ### 防线总结 @@ -283,13 +283,13 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as ## 事务状态机 ``` -INIT → AQUIRE → EXEC → COMMIT → RELEASING → RELEASED +INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED ↓ FAIL → RELEASING → RELEASED ``` - `INIT`:事务已创建,等待锁获取 -- `AQUIRE`:正在获取锁 +- `ACQUIRE`:正在获取锁 - `EXEC`:事务操作执行中 - `COMMIT`:已提交,可能有 post_actions 待执行 - `FAIL`:执行失败,进入回滚 diff --git a/openviking/service/core.py b/openviking/service/core.py index a43da906..b86f9697 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -149,6 +149,7 @@ def _init_storage( max_parallel_locks=tx_cfg.max_parallel_locks, lock_timeout=tx_cfg.lock_timeout, lock_expire=tx_cfg.lock_expire, + vector_store=self._vikingdb_manager, ) @property diff --git a/openviking/storage/local_fs.py b/openviking/storage/local_fs.py index 40497487..3d23566d 100644 --- a/openviking/storage/local_fs.py +++ b/openviking/storage/local_fs.py @@ -205,9 +205,10 @@ async def import_ovpack( if not zip_path: continue - # Normalize path separators to handle Windows-created ZIPs - zip_path = zip_path.replace("\\", "/") + # Validate before normalization so backslash paths are rejected safe_zip_path = _validate_ovpack_member_path(zip_path, base_name) + # Normalize path separators to handle Windows-created ZIPs + safe_zip_path = safe_zip_path.replace("\\", "/") # Handle directory entries if safe_zip_path.endswith("/"): diff --git a/openviking/storage/observers/transaction_observer.py 
b/openviking/storage/observers/transaction_observer.py index dce4555d..e29b7665 100644 --- a/openviking/storage/observers/transaction_observer.py +++ b/openviking/storage/observers/transaction_observer.py @@ -81,7 +81,7 @@ def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: # Group transactions by status status_counts = { TransactionStatus.INIT: 0, - TransactionStatus.AQUIRE: 0, + TransactionStatus.ACQUIRE: 0, TransactionStatus.EXEC: 0, TransactionStatus.COMMIT: 0, TransactionStatus.FAIL: 0, @@ -107,7 +107,7 @@ def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: status_priority = { TransactionStatus.EXEC: 0, - TransactionStatus.AQUIRE: 1, + TransactionStatus.ACQUIRE: 1, TransactionStatus.RELEASING: 2, TransactionStatus.INIT: 3, TransactionStatus.COMMIT: 4, @@ -206,7 +206,7 @@ def get_status_summary(self) -> Dict[str, int]: summary = { "INIT": 0, - "AQUIRE": 0, + "ACQUIRE": 0, "EXEC": 0, "COMMIT": 0, "FAIL": 0, diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index aa6ab5d3..b792833d 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -279,6 +279,8 @@ async def _overview_task(self, dir_uri: str) -> None: abstract = "" try: dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) + # No undo entries recorded: semantic files (.overview.md / .abstract.md) are + # regenerable, so residual writes after a crash are acceptable. 
async with TransactionContext( get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" ) as tx: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index b595c4e1..e7e108f2 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -335,6 +335,8 @@ async def _process_single_directory( dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) try: + # No undo entries recorded: semantic files (.overview.md / .abstract.md) are + # regenerable, so residual writes after a crash are acceptable. async with TransactionContext( get_transaction_manager(), "semantic", [dir_path], lock_mode="point" ) as tx: diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 68ad9784..5d63658e 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -145,7 +145,7 @@ def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: self.record.post_actions.append({"type": action_type, "params": params}) async def commit(self) -> None: - self._committed = True success = await self._tx_manager.commit(self._record.id) if not success: raise TransactionError(f"Failed to commit transaction {self._record.id}") + self._committed = True diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index e2879694..856a288c 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -272,6 +272,29 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue + # Check ancestor paths for SUBTREE locks held by other transactions + ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + 
f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): @@ -360,8 +383,9 @@ async def acquire_mv( return True async def release(self, transaction: TransactionRecord) -> None: + lock_count = len(transaction.locks) for lock_path in reversed(transaction.locks): await self._remove_lock_file(lock_path) transaction.remove_lock(lock_path) - logger.debug(f"Released {len(transaction.locks)} locks for transaction {transaction.id}") + logger.debug(f"Released {lock_count} locks for transaction {transaction.id}") diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 28dfe64d..9cd0cd9f 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -44,6 +44,7 @@ def __init__( max_parallel_locks: int = 8, lock_timeout: float = 0.0, lock_expire: float = 300.0, + vector_store: Optional[Any] = None, ): """Initialize transaction manager. @@ -55,6 +56,7 @@ def __init__( 0 (default) = fail immediately if locked. > 0 = wait/retry up to this many seconds. lock_expire: Stale lock expiry threshold in seconds (default: 300s). + vector_store: Optional vector store for VectorDB rollback operations. 
""" from openviking.storage.transaction.journal import TransactionJournal @@ -62,6 +64,7 @@ def __init__( self._timeout = timeout self._max_parallel_locks = max_parallel_locks self._lock_timeout = lock_timeout + self._vector_store = vector_store self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) self._journal = TransactionJournal(agfs_client) @@ -205,7 +208,7 @@ async def _recover_one(self, tx_id: str) -> None: await self._execute_post_actions(tx.post_actions) except Exception as e: logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") - elif tx.status in (TransactionStatus.INIT, TransactionStatus.AQUIRE): + elif tx.status in (TransactionStatus.INIT, TransactionStatus.ACQUIRE): # Transaction never executed any operations — nothing to rollback. # However, locks may have been created before the journal was updated # with the actual locks list. Use init_info.lock_paths to find and @@ -218,7 +221,12 @@ async def _recover_one(self, tx_id: str) -> None: # Pass recover_all=True so partial (completed=False) ops are also reversed, # e.g. a directory mv that started but never finished still leaves residue. 
try: - execute_rollback(tx.undo_log, self._agfs, recover_all=True) + execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + recover_all=True, + ) except Exception as e: logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") @@ -306,7 +314,7 @@ async def begin(self, transaction_id: str) -> bool: logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) logger.debug(f"Transaction begun: {transaction_id}") return True @@ -389,7 +397,11 @@ async def rollback(self, transaction_id: str) -> bool: # Execute undo log (best-effort) if tx.undo_log: try: - execute_rollback(tx.undo_log, self._agfs) + execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + ) except Exception as e: logger.warning( f"Undo log execution failed during rollback of {transaction_id}: {e}" @@ -447,10 +459,20 @@ async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: uri = params.get("uri") context_type = params.get("context_type", "resource") account_id = params.get("account_id", "default") + user_id = params.get("user_id", "default") + agent_id = params.get("agent_id", "default") + role = params.get("role", "root") if not uri: return - msg = SemanticMsg(uri=uri, context_type=context_type, account_id=account_id) + msg = SemanticMsg( + uri=uri, + context_type=context_type, + account_id=account_id, + user_id=user_id, + agent_id=agent_id, + role=role, + ) semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) await semantic_queue.enqueue(msg) @@ -469,7 +491,7 @@ async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) if success: @@ -497,7 +519,7 @@ 
async def acquire_lock_subtree( logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) @@ -533,7 +555,7 @@ async def acquire_lock_mv( logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_mv( src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir @@ -569,6 +591,7 @@ def init_transaction_manager( max_parallel_locks: int = 8, lock_timeout: float = 0.0, lock_expire: float = 300.0, + vector_store: Optional[Any] = None, ) -> TransactionManager: """Initialize transaction manager singleton. @@ -580,6 +603,7 @@ def init_transaction_manager( 0 (default) = fail immediately if locked. > 0 = wait/retry up to this many seconds. lock_expire: Stale lock expiry threshold in seconds (default: 300s). + vector_store: Optional vector store for VectorDB rollback operations. Returns: TransactionManager instance @@ -598,6 +622,7 @@ def init_transaction_manager( max_parallel_locks=max_parallel_locks, lock_timeout=lock_timeout, lock_expire=lock_expire, + vector_store=vector_store, ) logger.info("TransactionManager initialized as singleton") diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py index c73775de..b9eb0656 100644 --- a/openviking/storage/transaction/transaction_record.py +++ b/openviking/storage/transaction/transaction_record.py @@ -16,11 +16,11 @@ class TransactionStatus(str, Enum): """Transaction status enumeration. 
- Status machine: INIT -> AQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED + Status machine: INIT -> ACQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED """ INIT = "INIT" # Transaction initialized, waiting for lock acquisition - AQUIRE = "AQUIRE" # Acquiring lock resources + ACQUIRE = "ACQUIRE" # Acquiring lock resources EXEC = "EXEC" # Transaction operation in progress COMMIT = "COMMIT" # Transaction completed successfully FAIL = "FAIL" # Transaction failed diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 47368f8e..56621f60 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -1172,19 +1172,6 @@ def _handle_agfs_content(self, result: Union[bytes, Any, None]) -> str: return str(result) except Exception: return "" - """Handle AGFSClient content return types consistently.""" - if isinstance(result, bytes): - return result.decode("utf-8") - elif hasattr(result, "content"): - return result.content.decode("utf-8") - elif result is None: - return "" - else: - # Try to convert to string - try: - return str(result) - except Exception: - return "" def _infer_context_type(self, uri: str): """Infer context_type from URI. 
Returns None when ambiguous.""" diff --git a/tests/retrieve/test_hierarchical_retriever_rerank.py b/tests/retrieve/test_hierarchical_retriever_rerank.py index ffaea6a8..f72682b3 100644 --- a/tests/retrieve/test_hierarchical_retriever_rerank.py +++ b/tests/retrieve/test_hierarchical_retriever_rerank.py @@ -180,8 +180,8 @@ def test_merge_starting_points_prefers_rerank_scores_in_thinking_mode(monkeypatc "hello", ["viking://resources"], [ - {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2}, - {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8}, + {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2, "level": 1}, + {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8, "level": 1}, ], mode=RetrieverMode.THINKING, ) diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 627798b4..78dbb63e 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -20,8 +20,10 @@ from openviking.server.config import ServerConfig from openviking.server.identity import RequestContext, Role from openviking.service.core import OpenVikingService +from openviking.storage.transaction import reset_transaction_manager from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.embedding_config import EmbeddingConfig +from openviking_cli.utils.config.vlm_config import VLMConfig # --------------------------------------------------------------------------- # Paths @@ -67,6 +69,20 @@ def get_dimension(self) -> int: return FakeEmbedder +def _install_fake_vlm(monkeypatch): + """Use a fake VLM so server tests never hit external LLM APIs.""" + + async def _fake_get_completion(self, prompt, thinking=False, max_retries=0): + return "# Test Summary\n\nFake summary for testing.\n\n## Details\nTest content." + + async def _fake_get_vision_completion(self, prompt, images, thinking=False): + return "Fake image description for testing." 
+ + monkeypatch.setattr(VLMConfig, "is_available", lambda self: True) + monkeypatch.setattr(VLMConfig, "get_completion_async", _fake_get_completion) + monkeypatch.setattr(VLMConfig, "get_vision_completion_async", _fake_get_vision_completion) + + # --------------------------------------------------------------------------- # Core fixtures: service + app + async client (HTTP API tests, in-process) # --------------------------------------------------------------------------- @@ -94,7 +110,9 @@ def sample_markdown_file(temp_dir: Path) -> Path: @pytest_asyncio.fixture(scope="function") async def service(temp_dir: Path, monkeypatch): """Create and initialize an OpenVikingService in embedded mode.""" + reset_transaction_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "data"), user=UserIdentifier.the_default_user("test_user") ) @@ -102,6 +120,7 @@ async def service(temp_dir: Path, monkeypatch): svc.viking_fs.query_embedder = fake_embedder_cls() yield svc await svc.close() + reset_transaction_manager() @pytest_asyncio.fixture(scope="function") @@ -146,7 +165,9 @@ async def client_with_resource(client, service, sample_markdown_file): async def running_server(temp_dir: Path, monkeypatch): """Start a real uvicorn server in a background thread.""" await AsyncOpenViking.reset() + reset_transaction_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "sdk_data"), user=UserIdentifier.the_default_user("sdk_test_user") diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py index f45a55cc..bf077bf9 100644 --- a/tests/transaction/test_context_manager.py +++ b/tests/transaction/test_context_manager.py @@ -87,7 +87,9 @@ async def test_mv_lock_mode(self): ) as tx: await tx.commit() - tx_manager.acquire_lock_mv.assert_called_once_with("tx-test", "/src", "/dst") 
+ tx_manager.acquire_lock_mv.assert_called_once_with( + "tx-test", "/src", "/dst", src_is_dir=True + ) async def test_point_lock_mode(self): tx_manager, record = _make_tx_manager() diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py index 85384574..a8e3d993 100644 --- a/tests/transaction/test_crash_recovery.py +++ b/tests/transaction/test_crash_recovery.py @@ -276,12 +276,12 @@ async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): assert not any(".path.ovlock" in p for p in rm_calls) manager._journal.delete.assert_called_once_with("tx-innocent") - async def test_recover_aquire_status(self): - """AQUIRE status → same as INIT, clean up only.""" + async def test_recover_acquire_status(self): + """ACQUIRE status → same as INIT, clean up only.""" entries = { "tx-acq": { "id": "tx-acq", - "status": "AQUIRE", + "status": "ACQUIRE", "locks": ["/local/z/.path.ovlock"], "created_at": time.time(), "updated_at": time.time(), diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index d498fb59..d7b850c4 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -128,7 +128,7 @@ async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_d class TestE2EMvLock: async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): - """mv lock mode acquires SUBTREE on source and POINT on destination.""" + """mv lock mode acquires SUBTREE on both source and destination.""" src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" agfs_client.mkdir(src) @@ -144,7 +144,7 @@ async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_d dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token assert ":S" in src_token_str # SUBTREE on source - assert ":P" in dst_token_str # POINT on destination + assert ":S" in dst_token_str # SUBTREE on destination await 
tx.commit() diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py index e9af3fdc..2f3b6afc 100644 --- a/tests/transaction/test_path_lock.py +++ b/tests/transaction/test_path_lock.py @@ -181,8 +181,8 @@ async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): await lock.release(tx_child) - async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir): - """acquire_mv puts SUBTREE on src and POINT on dst.""" + async def test_acquire_mv_creates_subtree_locks(self, agfs_client, test_dir): + """acquire_mv puts SUBTREE on both src and dst.""" import uuid as _uuid src = f"{test_dir}/src-{_uuid.uuid4().hex}" @@ -209,7 +209,7 @@ async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir) if isinstance(dst_token_bytes, bytes) else dst_token_bytes ) - assert ":P" in dst_token + assert ":S" in dst_token await lock.release(tx) diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py index 3d6fd198..ef0f0b3e 100644 --- a/tests/transaction/test_transaction_manager.py +++ b/tests/transaction/test_transaction_manager.py @@ -67,7 +67,7 @@ async def test_begin_updates_status(self): tx = manager.create_transaction() ok = await manager.begin(tx.id) assert ok is True - assert tx.status == TransactionStatus.AQUIRE + assert tx.status == TransactionStatus.ACQUIRE async def test_begin_unknown_tx(self): manager, _ = _make_manager() From 5a9ffb52dcc46f2cd5142b23e75c2122c962a834 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 15:08:07 +0800 Subject: [PATCH 05/18] fix(transaction): fix rollback and race condition bugs - Reconstruct RequestContext from undo params for vectordb_delete/update_uri rollback (previously skipped silently due to missing ctx) - Serialize ctx fields into undo params in rm/mv operations - Fix Phase 1 undo path to target archive dir instead of session root - Remove Phase 2 fs_write_new undo (overwrites are 
idempotent, checkpoint handles recovery) - Add ancestor SUBTREE recheck after lock creation in acquire_subtree - Move _collect_uris inside TransactionContext in rm/mv to close race window - Log journal persistence failures instead of silently swallowing Co-Authored-By: Claude Opus 4.6 --- openviking/session/session.py | 6 +- .../storage/transaction/context_manager.py | 8 +-- openviking/storage/transaction/path_lock.py | 2 + openviking/storage/transaction/undo.py | 61 ++++++++++++++----- openviking/storage/viking_fs.py | 34 +++++++++-- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/openviking/session/session.py b/openviking/session/session.py index 067d0e1b..005b2f57 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -336,7 +336,9 @@ async def _phase1_archive_async( async with TransactionContext( tx_manager, "session_archive", [session_path], lock_mode="point" ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": session_path}) + archive_uri = f"{self._session_uri}/history/archive_{compression_index:03d}" + archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) + seq = tx.record_undo("fs_write_new", {"uri": archive_path}) self._write_archive( index=compression_index, messages=messages_to_archive, @@ -355,11 +357,9 @@ async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None async with TransactionContext( tx_manager, "session_memory", [session_path], lock_mode="point" ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": session_path}) self._write_to_agfs(self._messages) self._write_relations() self._write_checkpoint({"status": "completed"}) - tx.mark_completed(seq) tx.add_post_action( "enqueue_semantic", { diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 5d63658e..8272b91c 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -125,8 +125,8 
@@ def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: try: self._tx_manager.journal.update(self.record.to_journal()) - except Exception: - pass + except Exception as e: + logger.debug(f"[Transaction] Failed to persist journal: {e}") return seq @@ -138,8 +138,8 @@ def mark_completed(self, sequence: int) -> None: try: self._tx_manager.journal.update(self.record.to_journal()) - except Exception: - pass + except Exception as e: + logger.debug(f"[Transaction] Failed to persist journal: {e}") def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: self.record.post_actions.append({"type": action_type, "params": params}) diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 856a288c..a67cb6bc 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -323,6 +323,8 @@ async def acquire_subtree( backed_off = False conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + if not conflict_after: + conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index d64d1619..e11575ad 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -15,6 +15,29 @@ logger = get_logger(__name__) +def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: + """Reconstruct a RequestContext from serialized _ctx_* fields in undo params. + + Returns None if the required fields are missing. 
+ """ + account_id = params.get("_ctx_account_id") + user_id = params.get("_ctx_user_id") + agent_id = params.get("_ctx_agent_id") + role_value = params.get("_ctx_role") + if account_id is None or user_id is None: + return None + try: + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT + user = UserIdentifier(account_id, user_id, agent_id or "") + return RequestContext(user=user, role=role) + except Exception as e: + logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") + return None + + @dataclass class UndoEntry: """A single undo log entry representing one reversible sub-operation. @@ -124,24 +147,32 @@ def _rollback_entry( run_async(vector_store.delete([record_id])) elif op == "vectordb_delete": - if vector_store and ctx: - records_snapshot = params.get("records_snapshot", []) - for record in records_snapshot: - try: - run_async(vector_store.upsert(record)) - except Exception as e: - logger.warning(f"[Rollback] Failed to restore vector record: {e}") + if vector_store: + restored_ctx = _reconstruct_ctx(params) + if restored_ctx is None: + logger.warning("[Rollback] vectordb_delete: cannot reconstruct ctx, skipping") + else: + records_snapshot = params.get("records_snapshot", []) + for record in records_snapshot: + try: + run_async(vector_store.upsert(record, ctx=restored_ctx)) + except Exception as e: + logger.warning(f"[Rollback] Failed to restore vector record: {e}") elif op == "vectordb_update_uri": - if vector_store and ctx: - run_async( - vector_store.update_uri_mapping( - ctx=ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), + if vector_store: + restored_ctx = _reconstruct_ctx(params) + if restored_ctx is None: + logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") + else: + run_async( + 
vector_store.update_uri_mapping( + ctx=restored_ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), + ) ) - ) else: logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 56621f60..71ca1b74 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -298,12 +298,12 @@ async def rm( self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) tx_manager = get_transaction_manager() if not tx_manager: # Fallback: no transaction support + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) result = self.agfs.rm(path, recursive=recursive) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) return result @@ -314,6 +314,8 @@ async def rm( is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False except Exception: # Path does not exist: clean up any orphan index records and return + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) logger.info(f"[VikingFS] rm target not found, cleaned orphan index: {uri}") return {} @@ -327,12 +329,25 @@ async def rm( lock_mode = "point" async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: + # Collect URIs inside the lock to avoid race conditions + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + # Snapshot vector records for rollback records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) # Step 1: Delete from VectorDB first + real_ctx = self._ctx_or_default(ctx) seq_vdb = tx.record_undo( - "vectordb_delete", {"uris": 
uris_to_delete, "records_snapshot": records_snapshot} + "vectordb_delete", + { + "uris": uris_to_delete, + "records_snapshot": records_snapshot, + "_ctx_account_id": real_ctx.account_id, + "_ctx_user_id": real_ctx.user.user_id, + "_ctx_agent_id": real_ctx.user.agent_id, + "_ctx_role": real_ctx.role.value, + }, ) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) tx.mark_completed(seq_vdb) @@ -364,12 +379,12 @@ async def mv( old_path = self._uri_to_path(old_uri, ctx=ctx) new_path = self._uri_to_path(new_uri, ctx=ctx) target_uri = self._path_to_uri(old_path, ctx=ctx) - uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) - uris_to_move.append(target_uri) tx_manager = get_transaction_manager() if not tx_manager: # Fallback: no transaction support + uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) + uris_to_move.append(target_uri) try: result = self.agfs.mv(old_path, new_path) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) @@ -397,6 +412,10 @@ async def mv( mv_dst_path=dst_parent, src_is_dir=is_dir, ) as tx: + # Collect URIs inside the lock to avoid race conditions + uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) + uris_to_move.append(target_uri) + # Step 1: Copy source to destination seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) try: @@ -421,6 +440,7 @@ async def mv( old_parent_uri = ( old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" ) + real_ctx = self._ctx_or_default(ctx) seq_vdb = tx.record_undo( "vectordb_update_uri", { @@ -428,6 +448,10 @@ async def mv( "new_uri": new_uri, "old_parent_uri": old_parent_uri, "uris": uris_to_move, + "_ctx_account_id": real_ctx.account_id, + "_ctx_user_id": real_ctx.user.user_id, + "_ctx_agent_id": real_ctx.user.agent_id, + "_ctx_role": real_ctx.role.value, }, ) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) From 
273efbc408f2ef0d3fb902c4a0334ff74ff17f59 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 15:38:50 +0800 Subject: [PATCH 06/18] refactor(transaction): make TransactionManager required and rewrite tests with real backends Remove all optional/fallback code paths where tx_manager could be None. get_transaction_manager() now raises RuntimeError if not initialized. Fix undo rollback to reconstruct ctx for vectordb_upsert and use correct agent_id default. Replace mock-based transaction tests with integration tests using real AGFS and VectorDB backends. --- openviking/parse/tree_builder.py | 59 +- openviking/service/debug_service.py | 7 +- openviking/session/session.py | 33 +- .../transaction/transaction_manager.py | 12 +- openviking/storage/transaction/undo.py | 8 +- openviking/storage/viking_fs.py | 23 +- tests/integration/test_add_resource_index.py | 1 - tests/transaction/conftest.py | 86 +- tests/transaction/test_crash_recovery.py | 854 +++++++++++------- tests/transaction/test_rm_rollback.py | 363 ++++---- tests/transaction/test_undo.py | 260 ++++-- 11 files changed, 1035 insertions(+), 671 deletions(-) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 97935bdb..721c12bc 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -176,45 +176,32 @@ async def finalize_from_temp( # Lock parent directory (final_path doesn't exist yet) parent_path = final_path.rsplit("/", 1)[0] if "/" in final_path else final_path - if tx_manager: - # Ensure parent directories exist before locking - await self._ensure_parent_dirs(final_uri, ctx=ctx) - - async with TransactionContext( - tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" - ) as tx: - # Move temp to final - seq = tx.record_undo("fs_write_new", {"uri": final_path}) - await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) - tx.mark_completed(seq) - logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> 
{final_uri}") - - # Register semantic enqueue as post_action - tx.add_post_action( - "enqueue_semantic", - { - "uri": final_uri, - "context_type": "resource", - "account_id": ctx.account_id, - "user_id": ctx.user.user_id, - "agent_id": ctx.user.agent_id, - "role": ctx.role.value, - }, - ) - - await tx.commit() - else: - # Fallback: no transaction support + # Ensure parent directories exist before locking + await self._ensure_parent_dirs(final_uri, ctx=ctx) + + async with TransactionContext( + tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" + ) as tx: + # Move temp to final + seq = tx.record_undo("fs_write_new", {"uri": final_path}) await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + tx.mark_completed(seq) logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") - try: - await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) - logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") - except Exception as e: - logger.error( - f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True - ) + # Register semantic enqueue as post_action + tx.add_post_action( + "enqueue_semantic", + { + "uri": final_uri, + "context_type": "resource", + "account_id": ctx.account_id, + "user_id": ctx.user.user_id, + "agent_id": ctx.user.agent_id, + "role": ctx.role.value, + }, + ) + + await tx.commit() # 5. 
Cleanup temporary root directory try: diff --git a/openviking/service/debug_service.py b/openviking/service/debug_service.py index 9c3cf39b..7dffff65 100644 --- a/openviking/service/debug_service.py +++ b/openviking/service/debug_service.py @@ -138,13 +138,14 @@ def vlm(self) -> ComponentStatus: @property def transaction(self) -> ComponentStatus: """Get transaction status.""" - transaction_manager = get_transaction_manager() - if transaction_manager is None: + try: + transaction_manager = get_transaction_manager() + except Exception: return ComponentStatus( name="transaction", is_healthy=False, has_errors=True, - status="Transaction manager not initialized.", + status="Not initialized", ) observer = TransactionObserver(transaction_manager) return ComponentStatus( diff --git a/openviking/session/session.py b/openviking/session/session.py index 005b2f57..88444adb 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -251,25 +251,16 @@ def commit(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - if tx_manager: - run_async( - self._phase1_archive_async( - tx_manager, - session_path, - self._compression.compression_index, - messages_to_archive, - archive_abstract, - archive_overview, - ) - ) - else: - self._write_archive( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, + run_async( + self._phase1_archive_async( + tx_manager, + session_path, + self._compression.compression_index, + messages_to_archive, + archive_abstract, + archive_overview, ) - self._write_to_agfs(messages=[]) + ) self._compression.original_count += len(messages_to_archive) result["archived"] = True @@ -298,11 +289,7 @@ def commit(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", len(memories)) # ===== Phase 2: Memory write ===== - if tx_manager: - run_async(self._phase2_memory_async(tx_manager, 
session_path)) - else: - self._write_to_agfs(self._messages) - self._write_relations() + run_async(self._phase2_memory_async(tx_manager, session_path)) # Update active_count active_count_updated = self._update_active_counts() diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 9cd0cd9f..7b40c6be 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -629,12 +629,12 @@ def init_transaction_manager( return _transaction_manager -def get_transaction_manager() -> Optional[TransactionManager]: - """Get transaction manager singleton. - - Returns: - TransactionManager instance or None if not initialized - """ +def get_transaction_manager() -> TransactionManager: + """Get transaction manager singleton.""" + if _transaction_manager is None: + raise RuntimeError( + "TransactionManager not initialized. Call init_transaction_manager() first." + ) return _transaction_manager diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index e11575ad..a77aa5aa 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -31,7 +31,7 @@ def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: from openviking_cli.session.user_id import UserIdentifier role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT - user = UserIdentifier(account_id, user_id, agent_id or "") + user = UserIdentifier(account_id, user_id, agent_id or "default") return RequestContext(user=user, role=role) except Exception as e: logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") @@ -144,7 +144,11 @@ def _rollback_entry( if vector_store: record_id = params.get("record_id") if record_id: - run_async(vector_store.delete([record_id])) + restored_ctx = _reconstruct_ctx(params) + if restored_ctx: + run_async(vector_store.delete([record_id], ctx=restored_ctx)) + 
else: + logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") elif op == "vectordb_delete": if vector_store: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 71ca1b74..4db5afd7 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -22,7 +22,6 @@ from pathlib import PurePath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from openviking.pyagfs.exceptions import AGFSHTTPError from openviking.server.identity import RequestContext, Role from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime @@ -300,13 +299,6 @@ async def rm( target_uri = self._path_to_uri(path, ctx=ctx) tx_manager = get_transaction_manager() - if not tx_manager: - # Fallback: no transaction support - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) - result = self.agfs.rm(path, recursive=recursive) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - return result # Check existence and determine lock strategy try: @@ -381,19 +373,6 @@ async def mv( target_uri = self._path_to_uri(old_path, ctx=ctx) tx_manager = get_transaction_manager() - if not tx_manager: - # Fallback: no transaction support - uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) - uris_to_move.append(target_uri) - try: - result = self.agfs.mv(old_path, new_path) - await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) - return result - except AGFSHTTPError as e: - if e.status_code == 404: - await self._delete_from_vector_store(uris_to_move, ctx=ctx) - logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") - raise # Verify source exists and determine type before locking try: @@ -1228,9 +1207,9 @@ async def _snapshot_vector_records( for uri in uris: try: records = await 
vector_store.get_context_by_uri( - account_id=real_ctx.account_id, uri=uri, limit=10, + ctx=real_ctx, ) if records: snapshots.extend(records) diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 27d6e234..2a35462a 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -94,7 +94,6 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), - patch("openviking.storage.transaction.get_transaction_manager", return_value=None), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py index db77bbdd..05fac402 100644 --- a/tests/transaction/conftest.py +++ b/tests/transaction/conftest.py @@ -1,6 +1,6 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 -"""Shared fixtures for transaction tests using real AGFS backend.""" +"""Shared fixtures for transaction tests using real AGFS and VectorDB backends.""" import os import shutil @@ -9,13 +9,24 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.server.identity import RequestContext, Role +from openviking.storage.collection_schemas import CollectionSchemas +from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _make_fencing_token +from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend from openviking.utils.agfs_utils import create_agfs_client +from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.agfs_config import AGFSConfig +from openviking_cli.utils.config.vectordb_config import VectorDBBackendConfig AGFS_CONF = AGFSConfig( path="/tmp/ov-tx-test", backend="local", port=1834, url="http://localhost:1834", timeout=10 ) +VECTOR_DIM = 4 +COLLECTION_NAME = "tx_test_ctx" + # Clean slate before session starts if os.path.exists(AGFS_CONF.path): shutil.rmtree(AGFS_CONF.path) @@ -54,3 +65,76 @@ def test_dir(agfs_client): agfs_client.rm(path, recursive=True) except Exception: pass + + +# --------------------------------------------------------------------------- +# VectorDB fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def vector_store(tmp_path_factory): + """Session-scoped real local VectorDB backend.""" + db_path = str(tmp_path_factory.mktemp("vectordb")) + config = VectorDBBackendConfig( + backend="local", + name=COLLECTION_NAME, + path=db_path, + dimension=VECTOR_DIM, + ) + store = VikingVectorIndexBackend(config=config) + + import asyncio + + schema = 
CollectionSchemas.context_collection(COLLECTION_NAME, VECTOR_DIM) + asyncio.get_event_loop().run_until_complete(store.create_collection(COLLECTION_NAME, schema)) + + yield store + + asyncio.get_event_loop().run_until_complete(store.close()) + + +@pytest.fixture(scope="session") +def request_ctx(): + """Session-scoped RequestContext for VectorDB operations.""" + user = UserIdentifier("default", "test_user", "default") + return RequestContext(user=user, role=Role.ROOT) + + +# --------------------------------------------------------------------------- +# Transaction fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tx_manager(agfs_client, vector_store): + """Function-scoped TransactionManager with real backends.""" + return TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + + +@pytest.fixture +def journal(agfs_client): + """Function-scoped TransactionJournal with real AGFS backend.""" + return TransactionJournal(agfs_client) + + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + + +def file_exists(agfs_client, path) -> bool: + """Check if a file/dir exists in AGFS.""" + try: + agfs_client.stat(path) + return True + except Exception: + return False + + +def make_lock_file(agfs_client, dir_path, tx_id, lock_type="P") -> str: + """Create a real lock file in AGFS and return its path.""" + lock_path = f"{dir_path.rstrip('/')}/{LOCK_FILE_NAME}" + token = _make_fencing_token(tx_id, lock_type) + agfs_client.write(lock_path, token.encode("utf-8")) + return lock_path diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py index a8e3d993..21569edd 100644 --- a/tests/transaction/test_crash_recovery.py +++ b/tests/transaction/test_crash_recovery.py @@ -1,385 +1,561 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., 
Ltd. # SPDX-License-Identifier: Apache-2.0 -"""Integration test: crash recovery from journal.""" +"""Integration test: crash recovery from journal using real AGFS and VectorDB backends.""" -import time -from unittest.mock import AsyncMock, MagicMock, patch +import uuid +from unittest.mock import AsyncMock, patch +from openviking.storage.transaction.journal import TransactionJournal from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.transaction_record import ( + TransactionRecord, + TransactionStatus, +) +from openviking.storage.transaction.undo import UndoEntry + +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists, make_lock_file + + +def _write_journal(journal, record): + """Write a TransactionRecord to real journal storage.""" + journal.write(record.to_journal()) class TestCrashRecovery: - def _make_manager(self, journal_entries=None): - """Create a TransactionManager with mocked AGFS and journal data.""" - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - - if journal_entries: - manager._journal = MagicMock() - manager._journal.list_all.return_value = list(journal_entries.keys()) - manager._journal.read.side_effect = lambda tx_id: journal_entries[tx_id] - manager._journal.delete = MagicMock() - else: - manager._journal = MagicMock() - manager._journal.list_all.return_value = [] - - return manager, agfs - - async def test_recover_committed_with_post_actions(self): - """COMMIT + post_actions → replay post_actions, clean up.""" - entries = { - "tx-1": { - "id": "tx-1", - "status": "COMMIT", - "locks": ["/local/test/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test", - "context_type": "resource", - "account_id": "acc", - }, - } - ], - } - } - manager, agfs = self._make_manager(entries) + """ + Core technique: simulate crash 
recovery. + + 1. Create real FS state via agfs_client + 2. Build TransactionRecord, write to real journal + 3. Create fresh TransactionManager (simulates process restart) + 4. Call manager._recover_pending_transactions() + 5. Verify final state via agfs_client.stat()/cat() and vector_store.get() + """ + + async def test_recover_commit_no_rollback(self, agfs_client, vector_store, test_dir): + """COMMIT status → committed files NOT rolled back, journal cleaned up.""" + # Create a file that was part of a committed transaction + committed_file = f"{test_dir}/committed.txt" + agfs_client.write(committed_file, b"committed data") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-commit-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_write_new", + params={"uri": committed_file}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + # New manager (simulates restart) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + await manager._recover_pending_transactions() - with patch( - "openviking.storage.transaction.transaction_manager.TransactionManager._execute_post_actions", - new_callable=AsyncMock, - ) as mock_post: + # File should still exist (no rollback for committed tx) + assert file_exists(agfs_client, committed_file) + # Journal should be cleaned up + assert tx_id not in journal.list_all() + + async def test_recover_commit_replays_post_actions(self, agfs_client, vector_store, test_dir): + """COMMIT + post_actions → replay post_actions.""" + journal = TransactionJournal(agfs_client) + tx_id = f"tx-post-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[], + post_actions=[ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test-post", + "context_type": "resource", + "account_id": "acc", + 
}, + } + ], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + + with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: await manager._recover_pending_transactions() mock_post.assert_called_once() - agfs.rm.assert_called_once_with("/local/test/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-1") - - async def test_recover_committed_no_post_actions(self): - """COMMIT + no post_actions → just clean up, no rollback.""" - entries = { - "tx-2": { - "id": "tx-2", - "status": "COMMIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - # Even if undo_log has entries, COMMIT should NOT rollback - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/a", "dst": "/b"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_mv(self, agfs_client, vector_store, test_dir): + """EXEC status with fs_mv → recovery rolls back → file moved back.""" + src = f"{test_dir}/exec-mv-src" + dst = f"{test_dir}/exec-mv-dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/data.txt", b"mv-data") + + # Simulate: forward mv happened, then crash + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-mv-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": src, "dst": dst}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_not_called() # No rollback for committed transactions - 
manager._journal.delete.assert_called_once_with("tx-2") - - async def test_recover_exec_triggers_rollback(self): - """EXEC status → execute rollback regardless of transaction age.""" - entries = { - "tx-3": { - "id": "tx-3", - "status": "EXEC", - "locks": ["/local/x/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/local/a", "dst": "/local/b"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert file_exists(agfs_client, src) + assert not file_exists(agfs_client, dst) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_mkdir(self, agfs_client, vector_store, test_dir): + """EXEC with fs_mkdir → recovery → directory removed.""" + new_dir = f"{test_dir}/exec-mkdir" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-mkdir-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_called_once_with("/local/b", "/local/a") - manager._journal.delete.assert_called_once_with("tx-3") - - async def test_recover_fail_triggers_rollback(self): - """FAIL status → execute rollback.""" - entries = { - "tx-fail": { - "id": "tx-fail", - "status": "FAIL", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mkdir", - "params": {"uri": "/local/newdir"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) 
+ assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_write_new(self, agfs_client, vector_store, test_dir): + """EXEC with fs_write_new → recovery → file removed.""" + file_path = f"{test_dir}/exec-write.txt" + agfs_client.write(file_path, b"to-be-rolled-back") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-write-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_write_new", + params={"uri": file_path}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/newdir") - manager._journal.delete.assert_called_once_with("tx-fail") - - async def test_recover_exec_recover_all_includes_incomplete(self): - """EXEC recovery uses recover_all=True: also reverses incomplete entries.""" - entries = { - "tx-partial": { - "id": "tx-partial", - "status": "EXEC", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/local/a", "dst": "/local/b"}, - "completed": False, # not completed, but recover_all=True should still reverse it - } - ], - "post_actions": [], - } + assert not file_exists(agfs_client, file_path) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_vectordb_upsert( + self, agfs_client, vector_store, request_ctx, test_dir + ): + """EXEC with vectordb_upsert → recovery → record deleted from VectorDB.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/crash-upsert-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.5] * VECTOR_DIM, + "name": 
"crash-upsert", + "description": "test", + "abstract": "test", } - manager, agfs = self._make_manager(entries) + await vector_store.upsert(record, ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-vdb-{uuid.uuid4().hex[:8]}" + tx_record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, tx_record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_called_once_with("/local/b", "/local/a") - manager._journal.delete.assert_called_once_with("tx-partial") - - async def test_recover_init_just_cleans_up(self): - """INIT status → no rollback (nothing executed), just release locks and clean journal.""" - entries = { - "tx-4": { - "id": "tx-4", - "status": "INIT", - "locks": ["/local/y/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 + assert tx_id not in journal.list_all() + + async def test_recover_fail_triggers_rollback(self, agfs_client, vector_store, test_dir): + """FAIL status → also triggers rollback.""" + new_dir = f"{test_dir}/fail-dir" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-fail-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.FAIL, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + 
post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/y/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-4") - - async def test_recover_multiple_transactions(self): - """Multiple journals are all recovered.""" - entries = { - "tx-a": { - "id": "tx-a", - "status": "INIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - }, - "tx-b": { - "id": "tx-b", - "status": "COMMIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - }, - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) + assert tx_id not in journal.list_all() + + async def test_recover_releasing_triggers_rollback(self, agfs_client, vector_store, test_dir): + """RELEASING status → rollback + lock cleanup.""" + new_dir = f"{test_dir}/releasing-dir" + _mkdir_ok(agfs_client, new_dir) + + lock_path = make_lock_file(agfs_client, test_dir, "tx-releasing-placeholder", "S") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-releasing-{uuid.uuid4().hex[:8]}" + # Rewrite lock with correct tx_id + lock_path = make_lock_file(agfs_client, test_dir, tx_id, "S") + + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.RELEASING, + locks=[lock_path], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - assert manager._journal.delete.call_count == 2 - - async def test_recover_init_empty_locks_cleans_orphan_via_init_info(self): - """INIT with empty locks but init_info.lock_paths → clean up 
orphan lock files.""" - entries = { - "tx-orphan": { - "id": "tx-orphan", - "status": "INIT", - "locks": [], # Empty: crash happened before journal recorded locks - "init_info": { - "operation": "rm", - "lock_paths": ["/local/orphan-dir"], - "lock_mode": "subtree", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) - - # Simulate: the lock file exists and is owned by this transaction - from openviking.storage.transaction.path_lock import _make_fencing_token - - token = _make_fencing_token("tx-orphan", "S") - agfs.cat.return_value = token.encode("utf-8") + assert not file_exists(agfs_client, new_dir) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_exec_includes_incomplete(self, agfs_client, vector_store, test_dir): + """EXEC recovery uses recover_all=True → also reverses incomplete entries.""" + new_dir = f"{test_dir}/exec-incomplete" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-inc-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=False, # incomplete, but recover_all=True reverses it + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should have removed the orphan lock file - agfs.rm.assert_called() - rm_paths = [call[0][0] for call in agfs.rm.call_args_list] - assert any(".path.ovlock" in p for p in rm_paths) - manager._journal.delete.assert_called_once_with("tx-orphan") - - async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): - """INIT with orphan lock path, but lock file owned by a different tx → not 
removed.""" - entries = { - "tx-innocent": { - "id": "tx-innocent", - "status": "INIT", - "locks": [], - "init_info": { - "operation": "rm", - "lock_paths": ["/local/shared-dir"], - "lock_mode": "subtree", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) + assert tx_id not in journal.list_all() + + async def test_recover_init_cleans_locks(self, agfs_client, vector_store, test_dir): + """INIT status → no rollback, just lock cleanup + journal delete.""" + lock_dir = f"{test_dir}/init-lock-dir" + _mkdir_ok(agfs_client, lock_dir) - # Lock file owned by a different transaction - from openviking.storage.transaction.path_lock import _make_fencing_token + tx_id = f"tx-init-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - token = _make_fencing_token("tx-OTHER-owner", "S") - agfs.cat.return_value = token.encode("utf-8") + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[lock_path], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # rm should NOT be called for the lock file (only journal delete) - rm_calls = [call[0][0] for call in agfs.rm.call_args_list] if agfs.rm.called else [] - assert not any(".path.ovlock" in p for p in rm_calls) - manager._journal.delete.assert_called_once_with("tx-innocent") + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() - async def test_recover_acquire_status(self): + async def test_recover_acquire_cleans_locks(self, agfs_client, vector_store, test_dir): """ACQUIRE status → same as INIT, clean up only.""" - entries = { - "tx-acq": { - "id": "tx-acq", - "status": "ACQUIRE", - "locks": 
["/local/z/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + lock_dir = f"{test_dir}/acquire-lock-dir" + _mkdir_ok(agfs_client, lock_dir) + + tx_id = f"tx-acq-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.ACQUIRE, + locks=[lock_path], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/z/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-acq") - - async def test_recover_releasing_status_triggers_rollback(self): - """RELEASING status → process crashed while releasing, rollback undo log.""" - entries = { - "tx-rel": { - "id": "tx-rel", - "status": "RELEASING", - "locks": ["/local/r/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mkdir", - "params": {"uri": "/local/tmpdir"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_init_orphan_lock_via_init_info( + self, agfs_client, vector_store, test_dir + ): + """INIT with empty locks but init_info.lock_paths → clean orphan lock owned by tx.""" + orphan_dir = f"{test_dir}/orphan-dir" + _mkdir_ok(agfs_client, orphan_dir) + + tx_id = f"tx-orphan-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, orphan_dir, tx_id, "S") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], # Empty — crash happened before 
journal recorded locks + init_info={ + "operation": "rm", + "lock_paths": [orphan_dir], + "lock_mode": "subtree", + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should rollback the undo log - rm_paths = [call[0][0] for call in agfs.rm.call_args_list] - assert "/local/tmpdir" in rm_paths - manager._journal.delete.assert_called_once_with("tx-rel") - - async def test_recover_mv_orphan_locks_include_dst(self): - """INIT mv operation with init_info → check both lock_paths and mv_dst_path for orphan locks.""" - entries = { - "tx-mv-orphan": { - "id": "tx-mv-orphan", - "status": "INIT", - "locks": [], - "init_info": { - "operation": "mv", - "lock_paths": ["/local/src-dir"], - "lock_mode": "mv", - "mv_dst_path": "/local/dst-dir", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_init_orphan_lock_other_owner(self, agfs_client, vector_store, test_dir): + """INIT with orphan lock owned by different tx → not removed.""" + orphan_dir = f"{test_dir}/orphan-other" + _mkdir_ok(agfs_client, orphan_dir) + + other_tx_id = f"tx-OTHER-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, orphan_dir, other_tx_id, "S") + + tx_id = f"tx-innocent-{uuid.uuid4().hex[:8]}" + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], + init_info={ + "operation": "rm", + "lock_paths": [orphan_dir], + "lock_mode": "subtree", + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) - from openviking.storage.transaction.path_lock import _make_fencing_token + manager = TransactionManager(agfs_client=agfs_client, 
vector_store=vector_store) + await manager._recover_pending_transactions() - token = _make_fencing_token("tx-mv-orphan", "P") - agfs.cat.return_value = token.encode("utf-8") + # Lock file should still exist — owned by different tx + assert file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_mv_orphan_both_paths(self, agfs_client, vector_store, test_dir): + """INIT mv operation → check both lock_paths and mv_dst_path for orphan locks.""" + src_dir = f"{test_dir}/mv-orphan-src" + dst_dir = f"{test_dir}/mv-orphan-dst" + _mkdir_ok(agfs_client, src_dir) + _mkdir_ok(agfs_client, dst_dir) + + tx_id = f"tx-mv-orphan-{uuid.uuid4().hex[:8]}" + src_lock = make_lock_file(agfs_client, src_dir, tx_id, "S") + dst_lock = make_lock_file(agfs_client, dst_dir, tx_id, "P") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], + init_info={ + "operation": "mv", + "lock_paths": [src_dir], + "lock_mode": "mv", + "mv_dst_path": dst_dir, + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should check both src and dst paths for orphan locks - cat_paths = [call[0][0] for call in agfs.cat.call_args_list] - assert any("src-dir" in p for p in cat_paths) - assert any("dst-dir" in p for p in cat_paths) - - async def test_recover_journal_read_failure_skips_gracefully(self): - """If reading a journal entry fails, skip that tx and continue with others.""" - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - manager._journal = MagicMock() - manager._journal.list_all.return_value = ["tx-bad", "tx-good"] - - def read_side_effect(tx_id): - if tx_id == "tx-bad": - raise Exception("corrupted journal") - return { - "id": "tx-good", - "status": "INIT", - "locks": [], - "created_at": 
time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - - manager._journal.read.side_effect = read_side_effect - manager._journal.delete = MagicMock() + # Both orphan locks should be cleaned up + assert not file_exists(agfs_client, src_lock) + assert not file_exists(agfs_client, dst_lock) + assert tx_id not in journal.list_all() + + async def test_recover_multiple_transactions(self, agfs_client, vector_store, test_dir): + """Multiple journal entries are all recovered.""" + dir_a = f"{test_dir}/multi-tx-a" + _mkdir_ok(agfs_client, dir_a) + + journal = TransactionJournal(agfs_client) + + # tx-a: EXEC with mkdir → should rollback + tx_a = f"tx-multi-a-{uuid.uuid4().hex[:8]}" + record_a = TransactionRecord( + id=tx_a, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": dir_a}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record_a) + + # tx-b: COMMIT → no rollback, just cleanup + tx_b = f"tx-multi-b-{uuid.uuid4().hex[:8]}" + record_b = TransactionRecord( + id=tx_b, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record_b) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + await manager._recover_pending_transactions() + assert not file_exists(agfs_client, dir_a) # rolled back + assert tx_a not in journal.list_all() + assert tx_b not in journal.list_all() + + async def test_recover_corrupted_journal_skips(self, agfs_client, vector_store, test_dir): + """Corrupted journal entry → skipped, others still processed.""" + journal = TransactionJournal(agfs_client) + + # Write a corrupted journal entry (invalid JSON) + bad_tx_id = f"tx-bad-{uuid.uuid4().hex[:8]}" + _mkdir_ok(agfs_client, "/local/_system") + _mkdir_ok(agfs_client, "/local/_system/transactions") + bad_dir = f"/local/_system/transactions/{bad_tx_id}" + _mkdir_ok(agfs_client, bad_dir) + 
agfs_client.write(f"{bad_dir}/journal.json", b"NOT VALID JSON {{{{") + + # Write a good journal entry + good_dir = f"{test_dir}/good-recovery" + _mkdir_ok(agfs_client, good_dir) + + good_tx_id = f"tx-good-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=good_tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": good_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # tx-good should still be cleaned up - manager._journal.delete.assert_called_once_with("tx-good") + # Good tx should still be recovered + assert not file_exists(agfs_client, good_dir) + assert good_tx_id not in journal.list_all() diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py index ee28b7e7..68f5e8b4 100644 --- a/tests/transaction/test_rm_rollback.py +++ b/tests/transaction/test_rm_rollback.py @@ -2,232 +2,293 @@ # SPDX-License-Identifier: Apache-2.0 """Integration tests: multi-step rollback covering FS + VectorDB coordination.""" -from unittest.mock import AsyncMock, MagicMock +import uuid from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists + class TestRmRollback: - def test_vectordb_records_restored_on_fs_failure(self): - """When FS rm fails (incomplete), VectorDB delete is rolled back via snapshot.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_fs_rm_not_reversible(self, agfs_client, test_dir): + """fs_rm is intentionally irreversible: even completed=True is a no-op.""" + path = f"{test_dir}/rm-target" + _mkdir_ok(agfs_client, path) - snapshot = [{"id": "r1", "uri": "viking://a", "content": "data"}] undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={"uris": 
["viking://a"], "records_snapshot": snapshot}, - completed=True, # VectorDB delete succeeded - ), - UndoEntry( - sequence=1, - op_type="fs_rm", - params={"uri": "/local/test", "recursive": True}, - completed=False, # FS rm never ran - ), + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + # Directory still exists — fs_rm rollback does nothing + assert file_exists(agfs_client, path) - # Only vectordb_delete (completed=True) is reversed - vector_store.upsert.assert_called_once_with(snapshot[0]) - # fs_rm is incomplete, so it's skipped (also fs_rm is never reversible anyway) - agfs.rm.assert_not_called() - def test_fs_rm_not_reversible_even_when_completed(self): - """fs_rm is intentionally irreversible: even completed=True is skipped.""" - agfs = MagicMock() +class TestMvRollback: + def test_mv_reversed_on_rollback(self, agfs_client, test_dir): + """Real mv → rollback → content back at original location.""" + src = f"{test_dir}/mv-src" + dst = f"{test_dir}/mv-dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/payload.txt", b"important data") + + # Forward mv + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + content = agfs_client.cat(f"{dst}/payload.txt") + assert content == b"important data" + undo_log = [ UndoEntry( sequence=0, - op_type="fs_rm", - params={"uri": "/local/test"}, + op_type="fs_mv", + params={"src": src, "dst": dst}, completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.rm.assert_not_called() - agfs.mv.assert_not_called() + execute_rollback(undo_log, agfs_client) + assert file_exists(agfs_client, src) + restored = agfs_client.cat(f"{src}/payload.txt") + assert restored == b"important data" -class TestMvRollback: - def test_file_moved_back_on_vectordb_failure(self): - """When VectorDB update fails (incomplete), FS mv is reversed.""" - agfs = MagicMock() + +class 
TestRecoverAll: + def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): + """recover_all=True also reverses entries with completed=False.""" + new_dir = f"{test_dir}/recover-all-dir" + _mkdir_ok(agfs_client, new_dir) undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=True, # FS mv succeeded - ), - UndoEntry( - sequence=1, - op_type="vectordb_update_uri", - params={ - "old_uri": "viking://a", - "new_uri": "viking://b", - "old_parent_uri": "viking://", - }, - completed=False, # VectorDB update never ran - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] + execute_rollback(undo_log, agfs_client, recover_all=True) - execute_rollback(undo_log, agfs) + assert not file_exists(agfs_client, new_dir) - # Only fs_mv (completed=True) is reversed - agfs.mv.assert_called_once_with("/local/b", "/local/a") + def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): + """recover_all=False skips entries with completed=False.""" + new_dir = f"{test_dir}/skip-incomplete" + _mkdir_ok(agfs_client, new_dir) + undo_log = [ + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), + ] + execute_rollback(undo_log, agfs_client, recover_all=False) -class TestRecoverAll: - def test_recover_all_reverses_incomplete_entries(self): - """recover_all=True (crash recovery mode) also reverses incomplete entries.""" - agfs = MagicMock() + assert file_exists(agfs_client, new_dir) + + +class TestMultiStepRollback: + def test_reverse_order_nested_dirs(self, agfs_client, test_dir): + """parent + child → rollback reverses in reverse sequence order.""" + parent = f"{test_dir}/multi-parent" + child = f"{test_dir}/multi-parent/child" + _mkdir_ok(agfs_client, parent) + _mkdir_ok(agfs_client, child) undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": "/local/newdir"}, - completed=True, - ), - UndoEntry( - 
sequence=1, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=False, # Crash happened mid-operation - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, recover_all=True) - - # Both entries should be reversed (in reverse sequence order) - assert agfs.mv.call_count == 1 - agfs.mv.assert_called_once_with("/local/b", "/local/a") - agfs.rm.assert_called_once_with("/local/newdir") + assert not file_exists(agfs_client, child) + assert not file_exists(agfs_client, parent) - def test_recover_all_false_skips_incomplete(self): - """recover_all=False (normal rollback) skips incomplete entries.""" - agfs = MagicMock() + def test_write_new_rollback(self, agfs_client, test_dir): + """New file → rollback → file deleted.""" + file_path = f"{test_dir}/new-file.txt" + agfs_client.write(file_path, b"new content") + assert file_exists(agfs_client, file_path) undo_log = [ UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=False, + sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, recover_all=False) - agfs.mv.assert_not_called() - + assert not file_exists(agfs_client, file_path) -class TestVectorDBRollbackEdgeCases: - def test_multi_record_vectordb_delete_rollback(self): - """Multiple VectorDB records in snapshot should all be restored.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_best_effort_continues(self, agfs_client, test_dir): + """If one step fails, subsequent steps still execute.""" + real_dir = f"{test_dir}/best-effort-real" + _mkdir_ok(agfs_client, real_dir) - snapshot = [ - {"id": "r1", "uri": "viking://a", "content": "data1"}, - {"id": "r2", "uri": 
"viking://b", "content": "data2"}, - {"id": "r3", "uri": "viking://c", "content": "data3"}, - ] undo_log = [ + # seq=0: mkdir rollback on real dir → should succeed + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": real_dir}, completed=True), + # seq=1: mkdir rollback on nonexistent dir → fails silently UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": ["viking://a", "viking://b", "viking://c"], - "records_snapshot": snapshot, - }, + sequence=1, + op_type="fs_mkdir", + params={"uri": f"{test_dir}/no-such-dir-{uuid.uuid4().hex}"}, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + execute_rollback(undo_log, agfs_client) - assert vector_store.upsert.call_count == 3 + # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) + assert not file_exists(agfs_client, real_dir) - def test_empty_snapshot_vectordb_delete_rollback(self): - """Empty snapshot → nothing to restore, no error.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_unknown_op_type_no_crash(self, agfs_client, test_dir): + """Unknown op_type is logged but doesn't raise.""" + undo_log = [ + UndoEntry( + sequence=0, + op_type="some_future_op", + params={"foo": "bar"}, + completed=True, + ), + ] + # Should not raise + execute_rollback(undo_log, agfs_client) + + +class TestVectorDBRollback: + async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store, request_ctx): + """upsert → delete → rollback(vectordb_delete) → record restored.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/del-restore-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.3] * VECTOR_DIM, + "name": "del-restore", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Snapshot before delete + 
snapshot = await vector_store.get([record_id], ctx=request_ctx) + assert len(snapshot) == 1 + + # Forward: delete + await vector_store.delete([record_id], ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 0 undo_log = [ UndoEntry( sequence=0, op_type="vectordb_delete", - params={"uris": [], "records_snapshot": []}, + params={ + "uris": [record["uri"]], + "records_snapshot": snapshot, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) - vector_store.upsert.assert_not_called() - - def test_vectordb_delete_partial_restore_failure(self): - """If restoring one record fails, others should still be attempted.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() - - call_count = 0 - - async def upsert_side_effect(record): - nonlocal call_count - call_count += 1 - if record["id"] == "r2": - raise Exception("upsert failed") - - vector_store.upsert = AsyncMock(side_effect=upsert_side_effect) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 1 + + async def test_vectordb_delete_multi_record(self, agfs_client, vector_store, request_ctx): + """3 records in snapshot → rollback → all restored.""" + records = [] + for i in range(3): + rid = str(uuid.uuid4()) + rec = { + "id": rid, + "uri": f"viking://resources/multi-{rid}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.1 * (i + 1)] * VECTOR_DIM, + "name": f"multi-{i}", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(rec, ctx=request_ctx) + records.append(rec) + + ids = [r["id"] for r in records] + snapshot = await vector_store.get(ids, ctx=request_ctx) + assert len(snapshot) == 3 + + # Delete all + await 
vector_store.delete(ids, ctx=request_ctx) + assert len(await vector_store.get(ids, ctx=request_ctx)) == 0 - snapshot = [ - {"id": "r1", "uri": "viking://a"}, - {"id": "r2", "uri": "viking://b"}, # This one fails - {"id": "r3", "uri": "viking://c"}, - ] undo_log = [ UndoEntry( sequence=0, op_type="vectordb_delete", - params={"records_snapshot": snapshot}, + params={ + "uris": [r["uri"] for r in records], + "records_snapshot": snapshot, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) - # All 3 should be attempted (best-effort per record) - assert call_count == 3 + results = await vector_store.get(ids, ctx=request_ctx) + assert len(results) == 3 - def test_vectordb_upsert_rollback_without_vector_store_is_noop(self): - """vectordb_upsert rollback without vector_store does nothing.""" - agfs = MagicMock() + async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, request_ctx): + """Empty snapshot → no-op, no error.""" undo_log = [ UndoEntry( sequence=0, - op_type="vectordb_upsert", - params={"record_id": "r1"}, + op_type="vectordb_delete", + params={ + "uris": [], + "records_snapshot": [], + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] # Should not raise - execute_rollback(undo_log, agfs, vector_store=None) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): + """upsert → rollback(vectordb_upsert) → record deleted.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/upsert-del-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.4] * 
VECTOR_DIM, + "name": "upsert-del", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - def test_unknown_op_type_does_not_crash(self): - """Unknown op_type is logged but doesn't raise.""" - agfs = MagicMock() undo_log = [ UndoEntry( sequence=0, - op_type="some_future_op", - params={"foo": "bar"}, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py index d67063d1..1a68fe6a 100644 --- a/tests/transaction/test_undo.py +++ b/tests/transaction/test_undo.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for undo log and rollback executor.""" -from unittest.mock import AsyncMock, MagicMock +import uuid from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists + class TestUndoEntry: def test_to_dict(self): @@ -35,129 +37,213 @@ def test_roundtrip(self): class TestExecuteRollback: - def test_rollback_fs_mv(self): - agfs = MagicMock() + """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" + + def test_rollback_fs_mv(self, agfs_client, test_dir): + src = f"{test_dir}/src" + dst = f"{test_dir}/dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/data.txt", b"hello") + + # Forward: mv src → dst + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + assert file_exists(agfs_client, dst) + undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + sequence=0, + 
op_type="fs_mv", + params={"src": src, "dst": dst}, + completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_called_once_with("/b", "/a") + execute_rollback(undo_log, agfs_client) + + # src restored, dst gone + assert file_exists(agfs_client, src) + assert not file_exists(agfs_client, dst) + + def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): + path = f"{test_dir}/will-not-delete" + _mkdir_ok(agfs_client, path) - def test_rollback_fs_rm_skipped(self): - agfs = MagicMock() undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": "/a"}, completed=True), + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_not_called() - agfs.rm.assert_not_called() + execute_rollback(undo_log, agfs_client) + + # fs_rm rollback is a no-op; directory still exists + assert file_exists(agfs_client, path) + + def test_rollback_fs_mkdir(self, agfs_client, test_dir): + new_dir = f"{test_dir}/created" + _mkdir_ok(agfs_client, new_dir) + assert file_exists(agfs_client, new_dir) - def test_rollback_fs_mkdir(self): - agfs = MagicMock() undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": "/a/b"}, completed=True), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), ] - execute_rollback(undo_log, agfs) - agfs.rm.assert_called_once_with("/a/b") + execute_rollback(undo_log, agfs_client) + + assert not file_exists(agfs_client, new_dir) + + def test_rollback_fs_write_new(self, agfs_client, test_dir): + file_path = f"{test_dir}/new-file.txt" + agfs_client.write(file_path, b"content") + assert file_exists(agfs_client, file_path) - def test_rollback_fs_write_new(self): - agfs = MagicMock() undo_log = [ UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": "/a/f.txt"}, completed=True + sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs) - 
agfs.rm.assert_called_once_with("/a/f.txt", recursive=True) + execute_rollback(undo_log, agfs_client) + + assert not file_exists(agfs_client, file_path) + + def test_rollback_reverse_order(self, agfs_client, test_dir): + """mkdir parent + child → rollback → both removed in reverse order.""" + parent = f"{test_dir}/parent" + child = f"{test_dir}/parent/child" + _mkdir_ok(agfs_client, parent) + _mkdir_ok(agfs_client, child) - def test_rollback_vectordb_upsert(self): - agfs = MagicMock() - vector_store = AsyncMock() undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={"record_id": "r1"}, - completed=True, - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs, vector_store=vector_store) - vector_store.delete.assert_called_once_with(["r1"]) + execute_rollback(undo_log, agfs_client) + + # child removed first (seq=1), then parent (seq=0) + assert not file_exists(agfs_client, child) + assert not file_exists(agfs_client, parent) + + def test_rollback_skips_incomplete(self, agfs_client, test_dir): + new_dir = f"{test_dir}/incomplete" + _mkdir_ok(agfs_client, new_dir) - def test_rollback_vectordb_update_uri(self): - agfs = MagicMock() - ctx = MagicMock() - vector_store = AsyncMock() undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_update_uri", - params={ - "old_uri": "viking://a", - "new_uri": "viking://b", - "old_parent_uri": "viking://", - }, - completed=True, - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) - vector_store.update_uri_mapping.assert_called_once_with( - ctx=ctx, uri="viking://b", new_uri="viking://a", new_parent_uri="viking://" - ) + execute_rollback(undo_log, agfs_client) - def test_rollback_reverse_order(self): - """Rollback should process entries in 
reverse sequence order.""" - agfs = MagicMock() - call_order = [] - original_mv = agfs.mv - original_rm = agfs.rm + # completed=False → not rolled back + assert file_exists(agfs_client, new_dir) - def track_mv(*args): - call_order.append(("mv", args)) - return original_mv(*args) - - def track_rm(*args, **kwargs): - call_order.append(("rm", args)) - return original_rm(*args, **kwargs) + def test_rollback_best_effort(self, agfs_client, test_dir): + """A failing rollback entry should not prevent others from running.""" + real_dir = f"{test_dir}/real-dir" + _mkdir_ok(agfs_client, real_dir) - agfs.mv = track_mv - agfs.rm = track_rm + src = f"{test_dir}/be-src" + dst = f"{test_dir}/be-dst" + _mkdir_ok(agfs_client, dst) undo_log = [ + # seq=0: fs_mv rollback will succeed + UndoEntry(sequence=0, op_type="fs_mv", params={"src": src, "dst": dst}, completed=True), + # seq=1: fs_mkdir rollback will fail (rm on non-empty or non-existent path) UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + sequence=1, + op_type="fs_mkdir", + params={"uri": f"{test_dir}/nonexistent-dir-xyz"}, + completed=True, ), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), ] - execute_rollback(undo_log, agfs) - # seq=1 should be rolled back first (mkdir→rm), then seq=0 (mv→reverse mv) - assert call_order[0][0] == "rm" - assert call_order[1][0] == "mv" + # Should not raise + execute_rollback(undo_log, agfs_client) + + # seq=0 mv rollback should have executed (dst → src) + assert file_exists(agfs_client, src) + + async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request_ctx): + """Real upsert → rollback → record deleted.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/test-upsert-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.1] * VECTOR_DIM, + "name": "test", + 
"description": "test record", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Confirm it exists + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 1 - def test_rollback_skips_incomplete(self): - agfs = MagicMock() undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=False + sequence=0, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_not_called() + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 + + async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, request_ctx): + """Real upsert → update_uri_mapping → rollback → URI restored.""" + record_id = str(uuid.uuid4()) + old_uri = f"viking://resources/old-{record_id}.md" + new_uri = f"viking://resources/new-{record_id}.md" + record = { + "id": record_id, + "uri": old_uri, + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.2] * VECTOR_DIM, + "name": "test", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Forward: update URI mapping + await vector_store.update_uri_mapping( + ctx=request_ctx, + uri=old_uri, + new_uri=new_uri, + new_parent_uri="viking://resources/", + ) - def test_rollback_best_effort(self): - """A failing rollback entry should not prevent others from running.""" - agfs = MagicMock() - agfs.rm.side_effect = Exception("boom") - agfs.mv = MagicMock() + # Verify forward operation + result = await vector_store.fetch_by_uri(new_uri, ctx=request_ctx) + assert result is not None undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": 
"/a", "dst": "/b"}, completed=True + sequence=0, + op_type="vectordb_update_uri", + params={ + "old_uri": old_uri, + "new_uri": new_uri, + "old_parent_uri": "viking://resources/", + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, ), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), ] - execute_rollback(undo_log, agfs) - # fs_mkdir rollback failed (rm raises), but fs_mv rollback should still run - agfs.mv.assert_called_once_with("/b", "/a") + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + # URI should be restored to old_uri + result = await vector_store.fetch_by_uri(old_uri, ctx=request_ctx) + assert result is not None From 0122a3bc943dbc63893deb1240cd8d8e56b35c33 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 17:38:16 +0800 Subject: [PATCH 07/18] refactor(transaction): make rollback fully async and unify session commit path - Convert execute_rollback/rollback_entry to async, removing sync run_async wrappers - Unify Session.commit() to delegate to commit_async(), removing duplicate phase methods - Fix SUBTREE lock to conflict with ancestor SUBTREE locks (was previously missing) - Fix mv lock mode: directory moves now use SUBTREE on both source and destination - Replace deprecated asyncio.get_event_loop() with get_running_loop() - Remove max_parallel_locks config option - Update docs (en/zh) and tests to match new async rollback signatures --- docs/en/concepts/09-transaction.md | 48 +++-- docs/zh/concepts/09-transaction.md | 48 +++-- openviking/session/session.py | 188 +++++------------- openviking/storage/transaction/path_lock.py | 32 +-- .../transaction/transaction_manager.py | 4 +- openviking/storage/transaction/undo.py | 24 +-- tests/agfs/test_fs_binding.py | 9 +- tests/agfs/test_fs_binding_s3.py | 6 +- tests/agfs/test_fs_local.py | 5 +- tests/agfs/test_fs_s3.py | 5 +- tests/integration/test_add_resource_index.py | 25 ++- 
tests/session/test_memory_dedup_actions.py | 1 - tests/session/test_session_commit.py | 9 +- tests/storage/test_semantic_dag_skip_files.py | 29 ++- tests/storage/test_semantic_dag_stats.py | 2 +- tests/test_session_task_tracking.py | 2 +- tests/transaction/test_rm_rollback.py | 40 ++-- tests/transaction/test_undo.py | 32 +-- 18 files changed, 260 insertions(+), 249 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 3469ed2f..48e6d355 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -68,7 +68,7 @@ Rollback: Step 4 fails -> restore VectorDB records from snapshot. Transaction flow: ``` -1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on source + POINT on destination) +1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on both source and destination for directories) 2. Move FS file 3. Update VectorDB URIs 4. Commit -> release lock -> delete journal @@ -151,8 +151,8 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | lock_mode | Use case | Behavior | |-----------|----------|----------| | `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | -| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path and any lock on descendants | -| `mv` | Move operations | Acquire SUBTREE lock on source path, then POINT lock on destination path | +| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | +| `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | ## Lock Types (POINT vs SUBTREE) @@ -161,10 +161,10 @@ The lock mechanism uses two lock types to handle different conflict patterns: | | POINT on same 
path | SUBTREE on same path | POINT on descendant | SUBTREE on ancestor | |---|---|---|---|---| | **POINT** | Conflict | Conflict | — | Conflict | -| **SUBTREE** | Conflict | Conflict | Conflict | — | +| **SUBTREE** | Conflict | Conflict | Conflict | Conflict | - **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. -- **SUBTREE (S)**: Used for rm and mv-source operations. Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants for conflicting locks. +- **SUBTREE (S)**: Used for rm and mv operations. Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants and ancestor directories for conflicting locks. ## Undo Log @@ -182,6 +182,17 @@ Each transaction maintains an Undo Log recording the reverse action for each ste Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. +### Context Reconstruction + +VectorDB rollback operations require a `RequestContext` (containing account_id, user_id, agent_id, role). Since the original context is unavailable during crash recovery, `_ctx_*` fields are serialized into undo params when calling record_undo: + +- `_ctx_account_id`: Account ID +- `_ctx_user_id`: User ID +- `_ctx_agent_id`: Agent ID +- `_ctx_role`: Role + +During rollback, `_reconstruct_ctx()` rebuilds the context from these fields. If reconstruction fails (missing fields), the VectorDB rollback step is skipped with a warning. + ## Lock Mechanism ### Lock Protocol @@ -223,12 +234,23 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError loop until timeout (poll interval: 200ms): 1. Check target directory exists 2. 
Check if target directory is locked by another transaction - 3. Scan all descendant directories for any locks by other transactions - 4. Write SUBTREE (S) lock file (only one file, at the root path) - 5. TOCTOU double-check: re-scan descendants for new locks - - Conflict found: later one backs off (livelock prevention) - 6. Verify lock file ownership - 7. Success + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Scan all descendant directories for any locks by other transactions + - Stale lock? -> remove and retry + - Active lock? -> wait + 5. Write SUBTREE (S) lock file (only one file, at the root path) + 6. TOCTOU double-check: re-scan descendants and ancestors + - Conflict found: compare (timestamp, tx_id) + - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 7. Verify lock file ownership (fencing token matches) + 8. Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError ``` ### Lock Expiry Cleanup @@ -305,8 +327,7 @@ The transaction mechanism is enabled by default with no extra configuration need "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -316,7 +337,6 @@ The transaction mechanism is enabled by default with no extra configuration need |-----------|------|-------------|---------| | `lock_timeout` | float | Lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds. | `0.0` | | `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. 
| `300.0` | -| `max_parallel_locks` | int | Max parallel locks for rm/mv operations | `8` | ### QueueFS Persistence diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 6397cd2d..e4f1b8d0 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -68,7 +68,7 @@ Storage Layer (VikingFS, VectorDB, QueueManager) 事务流程: ``` -1. 开始事务,加锁(lock_mode="mv",源路径 SUBTREE + 目标路径 POINT) +1. 开始事务,加锁(lock_mode="mv",目录移动时源和目标均 SUBTREE) 2. 移动 FS 文件 3. 更新 VectorDB 中的 URI 4. 提交 → 删锁 → 删 journal @@ -151,8 +151,8 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | lock_mode | 用途 | 行为 | |-----------|------|------| | `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | -| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁和后代目录的任何锁冲突 | -| `mv` | 移动操作 | 源路径加 SUBTREE 锁,目标路径加 POINT 锁 | +| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | ## 锁类型(POINT vs SUBTREE) @@ -161,10 +161,10 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | | 同路径 POINT | 同路径 SUBTREE | 后代 POINT | 祖先 SUBTREE | |---|---|---|---|---| | **POINT** | 冲突 | 冲突 | — | 冲突 | -| **SUBTREE** | 冲突 | 冲突 | 冲突 | — | +| **SUBTREE** | 冲突 | 冲突 | 冲突 | 冲突 | - **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 -- **SUBTREE (S)**:用于删除和移动源操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代确认无冲突锁。 +- **SUBTREE (S)**:用于删除和移动操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代和祖先目录确认无冲突锁。 ## Undo Log @@ -182,6 +182,17 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as 回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 +### 上下文重建 + +VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、agent_id、role)。由于崩溃恢复时原始上下文不可用,record_undo 时在 undo params 中序列化 `_ctx_*` 字段: + +- `_ctx_account_id`:账户 ID +- `_ctx_user_id`:用户 ID +- `_ctx_agent_id`:代理 ID +- `_ctx_role`:角色 + 
+回滚时通过 `_reconstruct_ctx()` 从这些字段重建上下文。若重建失败(字段缺失),该 VectorDB 回滚步骤将被跳过并记录警告。 + ## 锁机制 ### 锁协议 @@ -223,12 +234,23 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 2. 检查目标路径是否被其他事务锁定 - 3. 扫描所有后代目录,检查是否有其他事务持有的锁 - 4. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) - 5. TOCTOU 双重检查:重新扫描后代目录 - - 发现冲突:后到者主动让步(活锁防止) - 6. 验证锁文件归属 - 7. 成功 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 4. 扫描所有后代目录,检查是否有其他事务持有的锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 5. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) + 6. TOCTOU 双重检查:重新扫描后代目录和祖先目录 + - 发现冲突:比较 (timestamp, tx_id) + - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 7. 验证锁文件归属(fencing token 匹配) + 8. 成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError ``` ### 锁过期清理 @@ -305,8 +327,7 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -316,7 +337,6 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED |------|------|------|--------| | `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | | `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | -| `max_parallel_locks` | int | rm/mv 操作的最大并行加锁数 | `8` | ### QueueFS 持久化 diff --git a/openviking/session/session.py b/openviking/session/session.py index 88444adb..b861deea 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -220,13 +220,17 @@ def update_tool_part( self._update_message_in_jsonl() def commit(self) -> Dict[str, Any]: - """Commit session: two-phase transaction with checkpoint. + """Sync wrapper for commit_async().""" + return run_async(self.commit_async()) + + async def commit_async(self) -> Dict[str, Any]: + """Async commit session: two-phase transaction with checkpoint. Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. LLM call (no transaction): Extract long-term memories. 
Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. """ - from openviking.storage.transaction import get_transaction_manager + from openviking.storage.transaction import TransactionContext, get_transaction_manager result = { "session_id": self.session_id, @@ -247,20 +251,30 @@ def commit(self) -> Dict[str, Any]: self._compression.compression_index += 1 messages_to_archive = self._messages.copy() - summary = self._generate_archive_summary(messages_to_archive) + summary = await self._generate_archive_summary_async(messages_to_archive) archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - run_async( - self._phase1_archive_async( - tx_manager, - session_path, - self._compression.compression_index, - messages_to_archive, - archive_abstract, - archive_overview, + async with TransactionContext( + tx_manager, "session_archive", [session_path], lock_mode="point" + ) as tx: + archive_uri = ( + f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" ) - ) + archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) + seq = tx.record_undo("fs_write_new", {"uri": archive_path}) + await self._write_archive_async( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + await self._write_to_agfs_async(messages=[]) + await self._write_checkpoint_async( + {"status": "archived", "archive_index": self._compression.compression_index} + ) + tx.mark_completed(seq) + await tx.commit() self._compression.original_count += len(messages_to_archive) result["archived"] = True @@ -275,13 +289,11 @@ def commit(self) -> Dict[str, Any]: logger.info( f"Starting memory extraction from {len(messages_to_archive)} archived messages" ) - memories = run_async( - self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) + 
memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, ) logger.info(f"Extracted {len(memories)} memories") result["memories_extracted"] = len(memories) @@ -289,64 +301,12 @@ def commit(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", len(memories)) # ===== Phase 2: Memory write ===== - run_async(self._phase2_memory_async(tx_manager, session_path)) - - # Update active_count - active_count_updated = self._update_active_counts() - result["active_count_updated"] = active_count_updated - - # Update statistics - self._stats.compression_count = self._compression.compression_index - result["stats"] = { - "total_turns": self._stats.total_turns, - "contexts_used": self._stats.contexts_used, - "skills_used": self._stats.skills_used, - "memories_extracted": self._stats.memories_extracted, - } - - self._stats.total_tokens = 0 - logger.info(f"Session {self.session_id} committed") - return result - - async def _phase1_archive_async( - self, - tx_manager: Any, - session_path: str, - compression_index: int, - messages_to_archive: list, - archive_abstract: str, - archive_overview: str, - ) -> None: - """Phase 1 of commit: archive messages inside a transaction.""" - from openviking.storage.transaction import TransactionContext - - async with TransactionContext( - tx_manager, "session_archive", [session_path], lock_mode="point" - ) as tx: - archive_uri = f"{self._session_uri}/history/archive_{compression_index:03d}" - archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) - seq = tx.record_undo("fs_write_new", {"uri": archive_path}) - self._write_archive( - index=compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - self._write_to_agfs(messages=[]) - self._write_checkpoint({"status": "archived", "archive_index": compression_index}) - tx.mark_completed(seq) - await tx.commit() 
- - async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None: - """Phase 2 of commit: write memories inside a transaction.""" - from openviking.storage.transaction import TransactionContext - async with TransactionContext( tx_manager, "session_memory", [session_path], lock_mode="point" ) as tx: - self._write_to_agfs(self._messages) - self._write_relations() - self._write_checkpoint({"status": "completed"}) + await self._write_to_agfs_async(self._messages) + await self._write_relations_async() + await self._write_checkpoint_async({"status": "completed"}) tx.add_post_action( "enqueue_semantic", { @@ -360,70 +320,11 @@ async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None ) await tx.commit() - async def commit_async(self) -> Dict[str, Any]: - """Async commit session: create archive, extract memories, persist.""" - result = { - "session_id": self.session_id, - "status": "committed", - "memories_extracted": 0, - "active_count_updated": 0, - "archived": False, - "stats": None, - } - if not self._messages: - get_current_telemetry().set("memory.extracted", 0) - return result - - # 1. Archive current messages - self._compression.compression_index += 1 - messages_to_archive = self._messages.copy() - - summary = await self._generate_archive_summary_async(messages_to_archive) - archive_abstract = self._extract_abstract_from_summary(summary) - archive_overview = summary - - await self._write_archive_async( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - - self._compression.original_count += len(messages_to_archive) - result["archived"] = True - - self._messages.clear() - logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" - ) - - # 2. 
Extract long-term memories - if self._session_compressor: - logger.info( - f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - # 3. Write current messages to AGFS - await self._write_to_agfs_async(self._messages) - - # 4. Create relations - await self._write_relations_async() - - # 5. Update active_count + # Update active_count active_count_updated = await self._update_active_counts_async() result["active_count_updated"] = active_count_updated - # 6. Update statistics + # Update statistics self._stats.compression_count = self._compression.compression_index result["stats"] = { "total_turns": self._stats.total_turns, @@ -844,6 +745,23 @@ def _write_checkpoint(self, data: Dict[str, Any]) -> None: ) ) + async def _write_checkpoint_async(self, data: Dict[str, Any]) -> None: + """Write a commit checkpoint file for crash recovery (async).""" + if not self._viking_fs: + return + + checkpoint = { + **data, + "session_id": self.session_id, + "compression_index": self._compression.compression_index, + "timestamp": get_current_timestamp(), + } + await self._viking_fs.write_file( + f"{self._session_uri}/.commit_checkpoint.json", + json.dumps(checkpoint, ensure_ascii=False), + ctx=self.ctx, + ) + def _read_checkpoint(self) -> Optional[Dict[str, Any]]: """Read commit checkpoint file if it exists.""" if not self._viking_fs: diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index a67cb6bc..8b412a67 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -159,7 
+159,7 @@ async def acquire_point( ) -> bool: transaction_id = transaction.id lock_path = self._get_lock_path(path) - deadline = asyncio.get_event_loop().time() + timeout + deadline = asyncio.get_running_loop().time() + timeout try: self._agfs.stat(path) @@ -172,12 +172,12 @@ async def acquire_point( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) @@ -190,14 +190,14 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) @@ -225,7 +225,7 @@ async def acquire_point( logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) return False @@ -234,7 +234,7 @@ async def acquire_point( if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[POINT] Lock ownership 
verification failed: {path}") - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue @@ -248,7 +248,7 @@ async def acquire_subtree( ) -> bool: transaction_id = transaction.id lock_path = self._get_lock_path(path) - deadline = asyncio.get_event_loop().time() + timeout + deadline = asyncio.get_running_loop().time() + timeout try: self._agfs.stat(path) @@ -261,12 +261,12 @@ async def acquire_subtree( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) @@ -280,14 +280,14 @@ async def acquire_subtree( f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) @@ -300,14 +300,14 @@ async def acquire_subtree( if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) - if asyncio.get_event_loop().time() >= deadline: 
+ if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" ) @@ -337,7 +337,7 @@ async def acquire_subtree( logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) return False @@ -346,7 +346,7 @@ async def acquire_subtree( if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 7b40c6be..379684a6 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -221,7 +221,7 @@ async def _recover_one(self, tx_id: str) -> None: # Pass recover_all=True so partial (completed=False) ops are also reversed, # e.g. a directory mv that started but never finished still leaves residue. 
try: - execute_rollback( + await execute_rollback( tx.undo_log, self._agfs, vector_store=self._vector_store, @@ -397,7 +397,7 @@ async def rollback(self, transaction_id: str) -> bool: # Execute undo log (best-effort) if tx.undo_log: try: - execute_rollback( + await execute_rollback( tx.undo_log, self._agfs, vector_store=self._vector_store, diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index a77aa5aa..0b5b3113 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -73,7 +73,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": ) -def execute_rollback( +async def execute_rollback( undo_log: List[UndoEntry], agfs: Any, vector_store: Optional[Any] = None, @@ -102,7 +102,7 @@ def execute_rollback( for entry in entries: try: - _rollback_entry(entry, agfs, vector_store, ctx) + await _rollback_entry(entry, agfs, vector_store, ctx) logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") except Exception as e: logger.warning( @@ -110,15 +110,13 @@ def execute_rollback( ) -def _rollback_entry( +async def _rollback_entry( entry: UndoEntry, agfs: Any, vector_store: Optional[Any], ctx: Optional[Any], ) -> None: """Dispatch rollback for a single undo entry.""" - from openviking_cli.utils import run_async - op = entry.op_type params = entry.params @@ -146,7 +144,7 @@ def _rollback_entry( if record_id: restored_ctx = _reconstruct_ctx(params) if restored_ctx: - run_async(vector_store.delete([record_id], ctx=restored_ctx)) + await vector_store.delete([record_id], ctx=restored_ctx) else: logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") @@ -159,7 +157,7 @@ def _rollback_entry( records_snapshot = params.get("records_snapshot", []) for record in records_snapshot: try: - run_async(vector_store.upsert(record, ctx=restored_ctx)) + await vector_store.upsert(record, ctx=restored_ctx) except Exception as e: logger.warning(f"[Rollback] Failed to 
restore vector record: {e}") @@ -169,13 +167,11 @@ def _rollback_entry( if restored_ctx is None: logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") else: - run_async( - vector_store.update_uri_mapping( - ctx=restored_ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), - ) + await vector_store.update_uri_mapping( + ctx=restored_ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), ) else: diff --git a/tests/agfs/test_fs_binding.py b/tests/agfs/test_fs_binding.py index ed8d3d33..e55ff6fd 100644 --- a/tests/agfs/test_fs_binding.py +++ b/tests/agfs/test_fs_binding.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -32,16 +33,16 @@ async def viking_fs_binding_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) - # Ensure test directory exists - await vfs.mkdir("viking://temp/", exist_ok=True) - yield vfs + reset_transaction_manager() + @pytest.mark.asyncio class TestVikingFSBindingLocal: diff --git a/tests/agfs/test_fs_binding_s3.py b/tests/agfs/test_fs_binding_s3.py index 692b869d..aa7a753b 100644 --- a/tests/agfs/test_fs_binding_s3.py +++ b/tests/agfs/test_fs_binding_s3.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -57,11 +58,14 @@ 
async def viking_fs_binding_s3_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_transaction_manager() + @pytest.mark.asyncio class TestVikingFSBindingS3: diff --git a/tests/agfs/test_fs_local.py b/tests/agfs/test_fs_local.py index 3a428ed6..9e59f610 100644 --- a/tests/agfs/test_fs_local.py +++ b/tests/agfs/test_fs_local.py @@ -10,6 +10,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -39,13 +40,15 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs + reset_transaction_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index ff9647e4..67a54e40 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -13,6 +13,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -82,11 +83,13 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + 
init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_transaction_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 2a35462a..84e1ebbe 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -1,6 +1,6 @@ import json import os -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -86,6 +86,19 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): mock_agfs = MockLocalAGFS(root_path=tmp_path / "mock_agfs_root") + # Create mock parse result for Phase 1 (media processor) + mock_parse_result = MagicMock() + mock_parse_result.source_path = str(resource_file) + mock_parse_result.meta = {} + mock_parse_result.temp_dir_path = "/tmp/fake_temp_dir" + mock_parse_result.warnings = [] + mock_parse_result.source_format = "markdown" + + # Create mock context tree for Phase 2/3 (tree builder) + mock_context_tree = MagicMock() + mock_context_tree.root = MagicMock() + mock_context_tree.root.uri = "viking://resources/test_doc" + # Patch the Summarizer and IndexBuilder to verify calls with ( patch( @@ -94,6 +107,16 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), + patch( + "openviking.utils.media_processor.UnifiedResourceProcessor.process", + new_callable=AsyncMock, + return_value=mock_parse_result, + ), + patch( + "openviking.parse.tree_builder.TreeBuilder.finalize_from_temp", + new_callable=AsyncMock, + return_value=mock_context_tree, + ), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/session/test_memory_dedup_actions.py 
b/tests/session/test_memory_dedup_actions.py index e7bb1a80..0f8f94e6 100644 --- a/tests/session/test_memory_dedup_actions.py +++ b/tests/session/test_memory_dedup_actions.py @@ -179,7 +179,6 @@ async def test_find_similar_memories_uses_path_must_filter_and__score(self): assert len(similar) == 1 assert similar[0].uri == existing.uri call = vikingdb.search_similar_memories.await_args.kwargs - assert call["account_id"] == "acc1" assert call["owner_space"] == _make_user().user_space_name() assert call["category_uri_prefix"] == ( f"viking://user/{_make_user().user_space_name()}/memories/preferences/" diff --git a/tests/session/test_session_commit.py b/tests/session/test_session_commit.py index 60a42d02..efa57fc7 100644 --- a/tests/session/test_session_commit.py +++ b/tests/session/test_session_commit.py @@ -6,9 +6,6 @@ from openviking import AsyncOpenViking from openviking.message import TextPart from openviking.session import Session -from tests.utils.mock_context import make_test_ctx - -ctx = make_test_ctx() class TestCommit: @@ -98,12 +95,14 @@ async def test_active_count_incremented_after_commit(self, client_with_resource_ """ client, uri = client_with_resource_sync vikingdb = client._client.service.vikingdb_manager + # Use the client's own context to match the account_id used when adding the resource + client_ctx = client._client._ctx # Look up the record by URI records_before = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_before, f"Resource not found for URI: {uri}" count_before = records_before[0].get("active_count") or 0 @@ -121,7 +120,7 @@ async def test_active_count_incremented_after_commit(self, client_with_resource_ records_after = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_after, f"Record disappeared after commit for URI: {uri}" count_after = records_after[0].get("active_count") or 0 diff --git a/tests/storage/test_semantic_dag_skip_files.py 
b/tests/storage/test_semantic_dag_skip_files.py index 75b23314..3c6fdd61 100644 --- a/tests/storage/test_semantic_dag_skip_files.py +++ b/tests/storage/test_semantic_dag_skip_files.py @@ -1,6 +1,8 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import AsyncMock, MagicMock + import pytest from openviking.server.identity import RequestContext, Role @@ -8,6 +10,24 @@ from openviking_cli.session.user_id import UserIdentifier +def _mock_transaction_layer(monkeypatch): + """Patch transaction layer to no-op for DAG tests.""" + mock_tx = MagicMock() + mock_tx.commit = AsyncMock() + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", + AsyncMock(return_value=mock_tx), + ) + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.get_transaction_manager", + lambda: MagicMock(), + ) + + class _FakeVikingFS: def __init__(self, tree): self._tree = tree @@ -19,6 +39,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -47,7 +70,8 @@ async def _vectorize_single_file( @pytest.mark.asyncio async def test_messages_jsonl_excluded_from_summary(monkeypatch): """messages.jsonl should be skipped by _list_dir and never summarized.""" - root_uri = "viking://sessions/test-session" + _mock_transaction_layer(monkeypatch) + root_uri = "viking://session/test-session" tree = { root_uri: [ {"name": "messages.jsonl", "isDir": False}, @@ -77,7 +101,8 @@ async def test_messages_jsonl_excluded_from_summary(monkeypatch): @pytest.mark.asyncio async def test_messages_jsonl_excluded_in_subdirectory(monkeypatch): """messages.jsonl 
in a subdirectory should also be skipped.""" - root_uri = "viking://sessions/test-session" + _mock_transaction_layer(monkeypatch) + root_uri = "viking://session/test-session" tree = { root_uri: [ {"name": "subdir", "isDir": True}, diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 202db790..85f4cb8b 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -76,7 +76,7 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.transaction_manager.get_transaction_manager", + "openviking.storage.transaction.get_transaction_manager", lambda: MagicMock(), ) diff --git a/tests/test_session_task_tracking.py b/tests/test_session_task_tracking.py index 8a61fe4d..1306d500 100644 --- a/tests/test_session_task_tracking.py +++ b/tests/test_session_task_tracking.py @@ -181,7 +181,7 @@ async def test_task_failed_when_memory_extraction_raises(api_client): async def failing_extract(_context, _user, _session_id): raise RuntimeError("memory_extraction_failed: synthetic extractor error") - service.sessions._session_compressor.extractor.extract_strict = failing_extract + service.sessions._session_compressor.extractor.extract = failing_extract resp = await client.post(f"/api/v1/sessions/{session_id}/commit", params={"wait": False}) task_id = resp.json()["result"]["task_id"] diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py index 68f5e8b4..604b5f50 100644 --- a/tests/transaction/test_rm_rollback.py +++ b/tests/transaction/test_rm_rollback.py @@ -10,7 +10,7 @@ class TestRmRollback: - def test_fs_rm_not_reversible(self, agfs_client, test_dir): + async def test_fs_rm_not_reversible(self, agfs_client, test_dir): """fs_rm is intentionally irreversible: even completed=True is a no-op.""" path = f"{test_dir}/rm-target" _mkdir_ok(agfs_client, path) @@ -18,14 +18,14 
@@ def test_fs_rm_not_reversible(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # Directory still exists — fs_rm rollback does nothing assert file_exists(agfs_client, path) class TestMvRollback: - def test_mv_reversed_on_rollback(self, agfs_client, test_dir): + async def test_mv_reversed_on_rollback(self, agfs_client, test_dir): """Real mv → rollback → content back at original location.""" src = f"{test_dir}/mv-src" dst = f"{test_dir}/mv-dst" @@ -46,7 +46,7 @@ def test_mv_reversed_on_rollback(self, agfs_client, test_dir): completed=True, ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert file_exists(agfs_client, src) restored = agfs_client.cat(f"{src}/payload.txt") @@ -54,7 +54,7 @@ def test_mv_reversed_on_rollback(self, agfs_client, test_dir): class TestRecoverAll: - def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): + async def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): """recover_all=True also reverses entries with completed=False.""" new_dir = f"{test_dir}/recover-all-dir" _mkdir_ok(agfs_client, new_dir) @@ -62,11 +62,11 @@ def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client, recover_all=True) + await execute_rollback(undo_log, agfs_client, recover_all=True) assert not file_exists(agfs_client, new_dir) - def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): + async def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): """recover_all=False skips entries with completed=False.""" new_dir = f"{test_dir}/skip-incomplete" _mkdir_ok(agfs_client, new_dir) @@ -74,13 +74,13 @@ def 
test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client, recover_all=False) + await execute_rollback(undo_log, agfs_client, recover_all=False) assert file_exists(agfs_client, new_dir) class TestMultiStepRollback: - def test_reverse_order_nested_dirs(self, agfs_client, test_dir): + async def test_reverse_order_nested_dirs(self, agfs_client, test_dir): """parent + child → rollback reverses in reverse sequence order.""" parent = f"{test_dir}/multi-parent" child = f"{test_dir}/multi-parent/child" @@ -91,12 +91,12 @@ def test_reverse_order_nested_dirs(self, agfs_client, test_dir): UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, child) assert not file_exists(agfs_client, parent) - def test_write_new_rollback(self, agfs_client, test_dir): + async def test_write_new_rollback(self, agfs_client, test_dir): """New file → rollback → file deleted.""" file_path = f"{test_dir}/new-file.txt" agfs_client.write(file_path, b"new content") @@ -107,11 +107,11 @@ def test_write_new_rollback(self, agfs_client, test_dir): sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, file_path) - def test_best_effort_continues(self, agfs_client, test_dir): + async def test_best_effort_continues(self, agfs_client, test_dir): """If one step fails, subsequent steps still execute.""" real_dir = f"{test_dir}/best-effort-real" _mkdir_ok(agfs_client, real_dir) @@ -127,12 +127,12 @@ def test_best_effort_continues(self, agfs_client, test_dir): completed=True, ), 
] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) assert not file_exists(agfs_client, real_dir) - def test_unknown_op_type_no_crash(self, agfs_client, test_dir): + async def test_unknown_op_type_no_crash(self, agfs_client, test_dir): """Unknown op_type is logged but doesn't raise.""" undo_log = [ UndoEntry( @@ -143,7 +143,7 @@ def test_unknown_op_type_no_crash(self, agfs_client, test_dir): ), ] # Should not raise - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) class TestVectorDBRollback: @@ -186,7 +186,7 @@ async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 1 @@ -233,7 +233,7 @@ async def test_vectordb_delete_multi_record(self, agfs_client, vector_store, req completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get(ids, ctx=request_ctx) assert len(results) == 3 @@ -255,7 +255,7 @@ async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, r ), ] # Should not raise - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): """upsert → rollback(vectordb_upsert) → record deleted.""" @@ -288,7 +288,7 @@ async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await 
execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 0 diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py index 1a68fe6a..aff57887 100644 --- a/tests/transaction/test_undo.py +++ b/tests/transaction/test_undo.py @@ -39,7 +39,7 @@ def test_roundtrip(self): class TestExecuteRollback: """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" - def test_rollback_fs_mv(self, agfs_client, test_dir): + async def test_rollback_fs_mv(self, agfs_client, test_dir): src = f"{test_dir}/src" dst = f"{test_dir}/dst" _mkdir_ok(agfs_client, src) @@ -58,25 +58,25 @@ def test_rollback_fs_mv(self, agfs_client, test_dir): completed=True, ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # src restored, dst gone assert file_exists(agfs_client, src) assert not file_exists(agfs_client, dst) - def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): + async def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): path = f"{test_dir}/will-not-delete" _mkdir_ok(agfs_client, path) undo_log = [ UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # fs_rm rollback is a no-op; directory still exists assert file_exists(agfs_client, path) - def test_rollback_fs_mkdir(self, agfs_client, test_dir): + async def test_rollback_fs_mkdir(self, agfs_client, test_dir): new_dir = f"{test_dir}/created" _mkdir_ok(agfs_client, new_dir) assert file_exists(agfs_client, new_dir) @@ -84,11 +84,11 @@ def test_rollback_fs_mkdir(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, new_dir) - def 
test_rollback_fs_write_new(self, agfs_client, test_dir): + async def test_rollback_fs_write_new(self, agfs_client, test_dir): file_path = f"{test_dir}/new-file.txt" agfs_client.write(file_path, b"content") assert file_exists(agfs_client, file_path) @@ -98,11 +98,11 @@ def test_rollback_fs_write_new(self, agfs_client, test_dir): sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, file_path) - def test_rollback_reverse_order(self, agfs_client, test_dir): + async def test_rollback_reverse_order(self, agfs_client, test_dir): """mkdir parent + child → rollback → both removed in reverse order.""" parent = f"{test_dir}/parent" child = f"{test_dir}/parent/child" @@ -113,25 +113,25 @@ def test_rollback_reverse_order(self, agfs_client, test_dir): UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # child removed first (seq=1), then parent (seq=0) assert not file_exists(agfs_client, child) assert not file_exists(agfs_client, parent) - def test_rollback_skips_incomplete(self, agfs_client, test_dir): + async def test_rollback_skips_incomplete(self, agfs_client, test_dir): new_dir = f"{test_dir}/incomplete" _mkdir_ok(agfs_client, new_dir) undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # completed=False → not rolled back assert file_exists(agfs_client, new_dir) - def test_rollback_best_effort(self, agfs_client, test_dir): + async def test_rollback_best_effort(self, agfs_client, test_dir): """A failing rollback entry should not prevent others from running.""" real_dir = 
f"{test_dir}/real-dir" _mkdir_ok(agfs_client, real_dir) @@ -152,7 +152,7 @@ def test_rollback_best_effort(self, agfs_client, test_dir): ), ] # Should not raise - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # seq=0 mv rollback should have executed (dst → src) assert file_exists(agfs_client, src) @@ -191,7 +191,7 @@ async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 0 @@ -242,7 +242,7 @@ async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, req completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) # URI should be restored to old_uri result = await vector_store.fetch_by_uri(old_uri, ctx=request_ctx) From 74cba6b6e17d5c3419983c0abca77f431a045e3a Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 18:11:00 +0800 Subject: [PATCH 08/18] fix: tests --- .../storage/viking_vector_index_backend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index d7a0ebe8..fd3b6e11 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -851,13 +851,21 @@ async def update_uri_mapping( async def increment_active_count(self, ctx: RequestContext, uris: List[str]) -> int: updated = 0 for uri in uris: - records = await self.get_context_by_uri(uri=uri, limit=1, ctx=ctx) + records = await self.get_context_by_uri(uri=uri, limit=100, ctx=ctx) if not records: continue - record = records[0] - current = int(record.get("active_count", 0) 
or 0) - record["active_count"] = current + 1 - if await self.upsert(record, ctx=ctx): + record_ids = [r["id"] for r in records if r.get("id")] + if not record_ids: + continue + # Re-fetch by ID to get full records including vectors + full_records = await self.get(record_ids, ctx=ctx) + uri_updated = False + for record in full_records: + current = int(record.get("active_count", 0) or 0) + record["active_count"] = current + 1 + if await self.upsert(record, ctx=ctx): + uri_updated = True + if uri_updated: updated += 1 return updated From c460ac6bccea8c6096759203af2e767d6ef2accc Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 21:15:24 +0800 Subject: [PATCH 09/18] refactor(transaction): simplify session commit and add redo-based crash recovery Session commit no longer wraps archive phase in a transaction. Phase 2 uses redo semantics so crashed memory-extraction can be replayed from archive. PathLock stale-lock cleanup no longer redundantly re-checks timeout. Semantic processor vectorization runs concurrently via asyncio.gather. --- openviking/session/session.py | 94 +++++++------ .../storage/queuefs/semantic_processor.py | 51 ++----- .../storage/transaction/context_manager.py | 10 +- openviking/storage/transaction/path_lock.py | 26 ---- .../transaction/transaction_manager.py | 132 ++++++++++++++++-- .../storage/viking_vector_index_backend.py | 2 +- 6 files changed, 192 insertions(+), 123 deletions(-) diff --git a/openviking/session/session.py b/openviking/session/session.py index b861deea..5726b8e3 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -224,11 +224,10 @@ def commit(self) -> Dict[str, Any]: return run_async(self.commit_async()) async def commit_async(self) -> Dict[str, Any]: - """Async commit session: two-phase transaction with checkpoint. + """Async commit session: two-phase approach. - Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. 
- LLM call (no transaction): Extract long-term memories. - Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. + Phase 1 (Archive, no transaction): Write archive, clear messages. + Phase 2 (Memory, transaction with redo semantics): Extract memories, write, enqueue. """ from openviking.storage.transaction import TransactionContext, get_transaction_manager @@ -245,9 +244,8 @@ async def commit_async(self) -> Dict[str, Any]: return result tx_manager = get_transaction_manager() - session_path = self._viking_fs._uri_to_path(self._session_uri, ctx=self.ctx) - # ===== Phase 1: Archive ===== + # ===== Preparation (no transaction) ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -255,58 +253,62 @@ async def commit_async(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - async with TransactionContext( - tx_manager, "session_archive", [session_path], lock_mode="point" - ) as tx: - archive_uri = ( - f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" - ) - archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) - seq = tx.record_undo("fs_write_new", {"uri": archive_path}) - await self._write_archive_async( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - await self._write_to_agfs_async(messages=[]) - await self._write_checkpoint_async( - {"status": "archived", "archive_index": self._compression.compression_index} - ) - tx.mark_completed(seq) - await tx.commit() + # ===== Phase 1: Archive (no transaction, no lock) ===== + archive_uri = ( + f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" + ) + await self._write_archive_async( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + await 
self._write_to_agfs_async(messages=[]) + self._messages.clear() self._compression.original_count += len(messages_to_archive) result["archived"] = True - self._messages.clear() logger.info( f"Archived: {len(messages_to_archive)} messages → " f"history/archive_{self._compression.compression_index:03d}/" ) - # ===== LLM call (no transaction) ===== - if self._session_compressor: - logger.info( - f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - # ===== Phase 2: Memory write ===== + # ===== Phase 2: Memory extraction + write (transaction, redo semantics) ===== async with TransactionContext( - tx_manager, "session_memory", [session_path], lock_mode="point" + tx_manager, + "session_memory", + [], + lock_mode="none", ) as tx: + # Store redo info so _recover_one can redo from archive on crash + tx.record.init_info.update( + { + "archive_uri": archive_uri, + "session_uri": self._session_uri, + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + } + ) + + if self._session_compressor: + logger.info( + f"Starting memory extraction from {len(messages_to_archive)} archived messages" + ) + memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, + ) + logger.info(f"Extracted {len(memories)} memories") + result["memories_extracted"] = len(memories) + self._stats.memories_extracted += len(memories) + get_current_telemetry().set("memory.extracted", len(memories)) + await 
self._write_to_agfs_async(self._messages) await self._write_relations_async() - await self._write_checkpoint_async({"status": "completed"}) tx.add_post_action( "enqueue_semantic", { diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index e7e108f2..c7d93eda 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -343,10 +343,8 @@ async def _process_single_directory( # 1. Collect .abstract.md from subdirectories children_abstracts = await self._collect_children_abstracts(children_uris) - # 2. Generate file summaries (vectorize inline, not via enqueue) - file_summaries = await self._generate_file_summaries( - file_paths, context_type=context_type, parent_uri=uri, enqueue_files=False - ) + # 2. Concurrently generate summaries for files in directory + file_summaries = await self._generate_file_summaries(file_paths) # 3. Generate .overview.md overview = await self._generate_overview(uri, file_summaries, children_abstracts) @@ -360,22 +358,23 @@ async def _process_single_directory( logger.debug(f"Generated overview and abstract for {uri}") - # 6. Vectorize directory and files (all inside the lock) - try: - await self._vectorize_directory_simple(uri, context_type, abstract, overview) - except Exception as e: - logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) - - for fp, summary in zip(file_paths, file_summaries): - try: - await self._vectorize_single_file( + # 6. 
Vectorize directory and files concurrently + vectorize_tasks = [ + self._vectorize_directory_simple(uri, context_type, abstract, overview), + *( + self._vectorize_single_file( parent_uri=uri, context_type=context_type, file_path=fp, summary_dict=summary, ) - except Exception as e: - logger.error(f"Failed to vectorize file {fp}: {e}", exc_info=True) + for fp, summary in zip(file_paths, file_summaries) + ), + ] + results = await asyncio.gather(*vectorize_tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.error(f"Vectorization failed: {result}", exc_info=True) await tx.commit() except LockAcquisitionError: @@ -395,32 +394,12 @@ async def _collect_children_abstracts(self, children_uris: List[str]) -> List[Di async def _generate_file_summaries( self, file_paths: List[str], - context_type: Optional[str] = None, - parent_uri: Optional[str] = None, - enqueue_files: bool = False, ) -> List[Dict[str, str]]: """Concurrently generate file summaries.""" if not file_paths: return [] - async def generate_one_summary(file_path: str) -> Dict[str, str]: - summary = await self._generate_single_file_summary(file_path, ctx=self._current_ctx) - if enqueue_files and context_type and parent_uri: - try: - await self._vectorize_single_file( - parent_uri=parent_uri, - context_type=context_type, - file_path=file_path, - summary_dict=summary, - ) - except Exception as e: - logger.error( - f"Failed to vectorize file {file_path}: {e}", - exc_info=True, - ) - return summary - - tasks = [generate_one_summary(fp) for fp in file_paths] + tasks = [self._generate_single_file_summary(fp, ctx=self._current_ctx) for fp in file_paths] return await asyncio.gather(*tasks) async def _generate_text_summary( diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 8272b91c..09697e10 100644 --- a/openviking/storage/transaction/context_manager.py +++ 
b/openviking/storage/transaction/context_manager.py @@ -74,7 +74,15 @@ async def __aenter__(self) -> "TransactionContext": logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") success = False - if self._lock_mode == "subtree": + if self._lock_mode == "none": + # No lock acquisition — transition directly to EXEC status + tx = self._tx_manager.get_transaction(tx_id) + if tx: + from openviking.storage.transaction.transaction_record import TransactionStatus + + tx.update_status(TransactionStatus.EXEC) + success = True + elif self._lock_mode == "subtree": for path in self._lock_paths: success = await self._tx_manager.acquire_lock_subtree(tx_id, path) if not success: diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 8b412a67..1097b0da 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -172,10 +172,6 @@ async def acquire_point( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_running_loop().time() >= deadline: - logger.warning(f"[POINT] Timeout waiting for lock on: {path}") - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") @@ -190,12 +186,6 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( @@ -261,10 +251,6 @@ async def acquire_subtree( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing 
stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_running_loop().time() >= deadline: - logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") @@ -280,12 +266,6 @@ async def acquire_subtree( f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( @@ -300,12 +280,6 @@ async def acquire_subtree( if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 379684a6..cb395432 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -184,7 +184,8 @@ async def _recover_one(self, tx_id: str) -> None: Recovery strategy by status: COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up + EXEC / FAIL / RELEASING, all ops completed → roll forward (commit), then clean up + EXEC / FAIL / RELEASING, 
partial ops → rollback completed+partial ops, then clean up INIT / ACQUIRE → nothing executed yet, just clean up """ from openviking.storage.transaction.undo import execute_rollback @@ -217,18 +218,43 @@ async def _recover_one(self, tx_id: str) -> None: if not tx.locks: await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) else: - # EXEC / FAIL / RELEASING: process crashed mid-operation — rollback - # Pass recover_all=True so partial (completed=False) ops are also reversed, - # e.g. a directory mv that started but never finished still leaves residue. - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - recover_all=True, - ) - except Exception as e: - logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") + # EXEC / FAIL / RELEASING: process crashed mid-operation + operation = tx.init_info.get("operation", "") + if operation == "session_memory": + # Redo: re-extract memories from archive and write + try: + await self._redo_session_memory(tx) + except Exception as e: + logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") + elif ( + tx.status == TransactionStatus.EXEC + and tx.undo_log + and all(e.completed for e in tx.undo_log) + ): + # All operations completed successfully but commit didn't persist. + # Roll forward: treat as committed to avoid data loss from rollback + # of irreversible operations (e.g. mv's fs_rm). + logger.info(f"All ops completed for tx {tx_id}, rolling forward (commit)") + if tx.post_actions: + try: + await self._execute_post_actions(tx.post_actions) + except Exception as e: + logger.warning( + f"Post-action replay during roll-forward failed for tx {tx_id}: {e}" + ) + else: + # Default: rollback completed+partial ops + # Pass recover_all=True so partial (completed=False) ops are also reversed, + # e.g. a directory mv that started but never finished still leaves residue. 
+ try: + await execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + recover_all=True, + ) + except Exception as e: + logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") # Release any lock files still present await self._path_lock.release(tx) @@ -275,6 +301,86 @@ async def _cleanup_orphan_locks_from_init_info( except Exception as e: logger.warning(f"Failed to check orphan lock {lock_file}: {e}") + async def _redo_session_memory(self, tx: TransactionRecord) -> None: + """Redo a session_memory transaction from its archived messages. + + On crash during Phase 2 of session commit, we redo memory extraction + from the archive rather than rolling back. + """ + import json + + from openviking.message import Message + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + archive_uri = tx.init_info.get("archive_uri") + session_uri = tx.init_info.get("session_uri") + account_id = tx.init_info.get("account_id", "default") + user_id = tx.init_info.get("user_id", "default") + agent_id = tx.init_info.get("agent_id", "default") + role_str = tx.init_info.get("role", "root") + + if not archive_uri or not session_uri: + logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") + return + + # 1. Read archived messages from AGFS + messages_path = f"{archive_uri}/messages.jsonl" + try: + agfs_path = messages_path.replace("viking://", "") + content = self._agfs.cat(agfs_path) + if isinstance(content, bytes): + content = content.decode("utf-8") + except Exception as e: + logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") + return + + messages = [] + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass + + if not messages: + logger.warning(f"No messages found in archive for redo: {archive_uri}") + return + + # 2. 
Build request context for memory extraction + user = UserIdentifier(user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) + + # 3. Re-extract memories + from openviking.session.compressor import SessionCompressor + + compressor = SessionCompressor() + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + + # 4. Enqueue semantic processing + await self._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": session_uri, + "context_type": "memory", + "account_id": account_id, + "user_id": user_id, + "agent_id": agent_id, + "role": role_str, + }, + } + ] + ) + def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: """Create a new transaction. diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index fd3b6e11..9bdf5976 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -834,7 +834,7 @@ async def update_uri_mapping( # so fetch and update all of them. 
records = await self.filter( filter=And([Eq("uri", uri), Eq("account_id", ctx.account_id)]), - limit=1, + limit=100, ctx=ctx, ) if not records: From e494f38fe9e54c25479a886ec6e3178258e38dfc Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 21:47:07 +0800 Subject: [PATCH 10/18] fix: transaction --- docs/en/concepts/09-transaction.md | 50 +++++++++++-------- docs/zh/concepts/09-transaction.md | 50 +++++++++++-------- .../transaction/transaction_manager.py | 19 +------ 3 files changed, 61 insertions(+), 58 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 48e6d355..53e1a4a1 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -97,29 +97,36 @@ Crash recovery: Journal records the post_action; replayed automatically on resta | Problem | Solution | |---------|----------| -| Messages cleared but archive not written -> conversation data lost | Split into two transactions + checkpoint | +| Messages cleared but archive not written -> conversation data lost | Phase 1 without transaction (incomplete archive has no side effects) + Phase 2 with redo transaction | -LLM calls have unpredictable latency (5s~60s+), so they cannot be inside a transaction. Split into: +LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding transaction. The design splits into two phases: ``` -Transaction 1 (Archive): - 1. Write archive (history/archive_N/messages.jsonl + summaries) - 2. Clear messages.jsonl - 3. Write checkpoint (status="archived") - 4. Commit - -LLM call (no transaction): - Extract memories from archived messages - -Transaction 2 (Memory write): - 1. Write memory files - 2. Write relations - 3. Update checkpoint (status="completed") - 4. Register post_action: enqueue SemanticQueue - 5. Commit +Phase 1 — Archive (no transaction, no lock): + 1. Generate archive summary (LLM) + 2. Write archive (history/archive_N/messages.jsonl + summaries) + 3. 
Clear messages.jsonl + 4. Clear in-memory message list + +Phase 2 — Memory extraction + write (transaction, lock_mode="none", redo semantics): + 1. Record init_info (archive_uri, session_uri, user identity) + 2. Extract memories from archived messages (LLM) + 3. Write current message state + 4. Write relations + 5. Register post_action: enqueue SemanticQueue + 6. Commit ``` -Crash recovery: Read checkpoint, resume from the appropriate step based on status. +**Redo semantics**: Phase 2 does not register undo log entries. On crash recovery, memory extraction and writing are re-executed from the archive (`_redo_session_memory`) instead of being rolled back. + +**Crash recovery analysis**: + +| Crash point | State | Recovery action | +|------------|-------|----------------| +| During Phase 1 archive write | No transaction | Incomplete archive; next commit scans history/ for index, unaffected | +| Phase 1 archive complete but messages not cleared | No transaction | Archive complete + messages still present = redundant but safe | +| During Phase 2 memory extraction/write | Journal EXEC | On startup: `_redo_session_memory` redoes extraction + write + enqueue from archive | +| After Phase 2 commit | Journal COMMIT | On startup: replay `post_action("enqueue_semantic")` | ## TransactionContext @@ -153,6 +160,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | | `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | | `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | +| `none` | Lock-free operations | Skip lock acquisition, transition directly to EXEC status. 
Used for session.commit Phase 2 and other scenarios that don't require path mutual exclusion | ## Lock Types (POINT vs SUBTREE) @@ -288,7 +296,9 @@ Rollback -> execute undo log -> release locks -> delete journal |------------------------|----------------| | `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | | `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (`session_memory` operation) | Redo memory extraction + write from archive (`_redo_session_memory`) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (all undo entries completed) | Roll forward (treat as committed, replay post_actions) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (other) | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | | `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | ### Defense Summary @@ -298,7 +308,7 @@ Rollback -> execute undo log -> release locks -> delete journal | Crash during transaction | Journal + undo log rollback | On restart | | Crash after commit, before enqueue | Journal post_actions replay | On restart | | Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | -| Crash during session.commit LLM call | Checkpoint file recovery | On restart, re-invoke LLM | +| Crash during session.commit Phase 2 | Journal + redo (re-extract memories from archive) | On restart | | Orphan index | Cleaned on L2 on-demand load | When user accesses | | Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 
e4f1b8d0..ccd98ffc 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -97,29 +97,36 @@ Storage Layer (VikingFS, VectorDB, QueueManager) | 问题 | 方案 | |------|------| -| 消息已清空但 archive 未写入 → 对话数据丢失 | 拆为两段事务 + checkpoint | +| 消息已清空但 archive 未写入 → 对话数据丢失 | Phase 1 无事务(archive 不完整无副作用)+ Phase 2 redo 事务 | -LLM 调用耗时不可控(5s~60s+),放在事务内会长时间持锁。因此拆为: +LLM 调用耗时不可控(5s~60s+),不能放在持锁事务内。设计拆为两个阶段: ``` -第一段事务(归档): - 1. 写 archive(history/archive_N/messages.jsonl + 摘要) - 2. 清空 messages.jsonl - 3. 写 checkpoint(status="archived") - 4. 提交 - -LLM 调用(无事务): - 从归档消息提取 memories - -第二段事务(memory 写入): - 1. 写 memory 文件 - 2. 写 relations - 3. 更新 checkpoint(status="completed") - 4. 注册 post_action: enqueue SemanticQueue - 5. 提交 +Phase 1 — 归档(无事务、无锁): + 1. 生成归档摘要(LLM) + 2. 写 archive(history/archive_N/messages.jsonl + 摘要) + 3. 清空 messages.jsonl + 4. 清空内存中的消息列表 + +Phase 2 — 记忆提取 + 写入(事务,lock_mode="none",redo 语义): + 1. 记录 init_info(archive_uri、session_uri、用户身份信息) + 2. 从归档消息提取 memories(LLM) + 3. 写当前消息状态 + 4. 写 relations + 5. 注册 post_action: enqueue SemanticQueue + 6. 
提交 ``` -崩溃恢复:读 checkpoint,根据 status 决定从哪一步继续。 +**Redo 语义**:Phase 2 不注册 undo log。崩溃恢复时从 archive 重新执行记忆提取和写入(`_redo_session_memory`),而非回滚。 + +**崩溃恢复分析**: + +| 崩溃时间点 | 状态 | 恢复动作 | +|-----------|------|---------| +| Phase 1 写 archive 中途 | 无事务 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | +| Phase 1 archive 完成但 messages 未清空 | 无事务 | archive 完整 + messages 仍在 = 数据冗余但安全 | +| Phase 2 记忆提取/写入中途 | journal EXEC | 启动恢复:`_redo_session_memory` 从 archive 重做提取+写入+入队 | +| Phase 2 commit 后 | journal COMMIT | 启动恢复:重放 `post_action("enqueue_semantic")` | ## TransactionContext @@ -153,6 +160,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | | `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | | `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | +| `none` | 无锁操作 | 跳过锁获取,直接进入 EXEC 状态。用于 session.commit Phase 2 等不需要路径互斥的场景 | ## 锁类型(POINT vs SUBTREE) @@ -288,7 +296,9 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age |---------------------|---------| | `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | | `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(`session_memory` 操作) | 从 archive 重做记忆提取+写入(`_redo_session_memory`) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(所有 undo 均 completed) | 前滚(视为已提交,重放 post_actions) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(其他) | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | | `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | ### 防线总结 @@ -298,7 +308,7 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age | 事务内崩溃 | journal + undo log 回滚 | 重启时 | | 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | | enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | -| session.commit LLM 调用中崩溃 | checkpoint 文件恢复 | 重启时重新调用 LLM | +| 
session.commit Phase 2 中崩溃 | journal + redo(从 archive 重做记忆提取) | 重启时 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | | 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index cb395432..041b8423 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -184,8 +184,7 @@ async def _recover_one(self, tx_id: str) -> None: Recovery strategy by status: COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING, all ops completed → roll forward (commit), then clean up - EXEC / FAIL / RELEASING, partial ops → rollback completed+partial ops, then clean up + EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up INIT / ACQUIRE → nothing executed yet, just clean up """ from openviking.storage.transaction.undo import execute_rollback @@ -226,22 +225,6 @@ async def _recover_one(self, tx_id: str) -> None: await self._redo_session_memory(tx) except Exception as e: logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") - elif ( - tx.status == TransactionStatus.EXEC - and tx.undo_log - and all(e.completed for e in tx.undo_log) - ): - # All operations completed successfully but commit didn't persist. - # Roll forward: treat as committed to avoid data loss from rollback - # of irreversible operations (e.g. mv's fs_rm). 
- logger.info(f"All ops completed for tx {tx_id}, rolling forward (commit)") - if tx.post_actions: - try: - await self._execute_post_actions(tx.post_actions) - except Exception as e: - logger.warning( - f"Post-action replay during roll-forward failed for tx {tx_id}: {e}" - ) else: # Default: rollback completed+partial ops # Pass recover_all=True so partial (completed=False) ops are also reversed, From c61978f5df85cc823dc41211ac95ccd3ce8ccd55 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 15:32:45 +0800 Subject: [PATCH 11/18] fix: UserIdentifier --- openviking/storage/transaction/transaction_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 041b8423..e80df09b 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -331,7 +331,7 @@ async def _redo_session_memory(self, tx: TransactionRecord) -> None: return # 2. Build request context for memory extraction - user = UserIdentifier(user_id=user_id, agent_id=agent_id) + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) # 3. Re-extract memories From 4e44a5d6163531bdeecd9a27854183ce17926bc7 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 19:44:22 +0800 Subject: [PATCH 12/18] refactor(transaction): replace undo-based transaction manager with lightweight lock + redo-log Remove the heavyweight TransactionManager/Journal/UndoEntry system (~4000 lines) and replace it with a simpler architecture: LockManager for path locking, LockContext as the async context manager, LockHandle/LockOwner protocol, and a RedoLog for crash recovery of session_memory operations. VikingFS rm/mv now use inline error handling instead of rollback semantics. Updated docs, observers, and tests accordingly. 
Co-Authored-By: Claude Opus 4.6 --- docs/design/multi-tenant-design.md | 2 +- docs/en/concepts/09-transaction.md | 306 ++++---- docs/en/guides/01-configuration.md | 11 +- docs/zh/concepts/09-transaction.md | 341 ++++---- docs/zh/guides/01-configuration.md | 13 +- openviking/async_client.py | 6 +- openviking/server/routers/content.py | 26 +- openviking/server/routers/observer.py | 8 +- openviking/service/core.py | 32 +- openviking/service/debug_service.py | 18 +- openviking/session/session.py | 109 +-- openviking/storage/errors.py | 10 +- openviking/storage/observers/__init__.py | 4 +- openviking/storage/observers/lock_observer.py | 71 ++ .../storage/observers/transaction_observer.py | 222 ------ openviking/storage/queuefs/semantic_dag.py | 9 +- .../storage/queuefs/semantic_processor.py | 10 +- openviking/storage/transaction/__init__.py | 42 +- .../storage/transaction/context_manager.py | 159 ---- openviking/storage/transaction/journal.py | 113 --- .../storage/transaction/lock_context.py | 68 ++ openviking/storage/transaction/lock_handle.py | 37 + .../storage/transaction/lock_manager.py | 247 ++++++ openviking/storage/transaction/path_lock.py | 110 ++- openviking/storage/transaction/redo_log.py | 76 ++ .../transaction/transaction_manager.py | 739 ------------------ .../storage/transaction/transaction_record.py | 139 ---- openviking/storage/transaction/undo.py | 178 ----- openviking/storage/viking_fs.py | 119 +-- openviking/utils/resource_processor.py | 17 +- .../utils/config/transaction_config.py | 5 - tests/agfs/test_fs_binding.py | 8 +- tests/agfs/test_fs_binding_s3.py | 8 +- tests/agfs/test_fs_local.py | 8 +- tests/agfs/test_fs_s3.py | 8 +- tests/server/conftest.py | 8 +- tests/storage/test_semantic_dag_skip_files.py | 16 +- tests/storage/test_semantic_dag_stats.py | 16 +- tests/transaction/conftest.py | 19 +- tests/transaction/test_concurrent_lock.py | 12 +- tests/transaction/test_context_manager.py | 226 ------ tests/transaction/test_crash_recovery.py | 561 
------------- tests/transaction/test_e2e.py | 448 ++--------- tests/transaction/test_journal.py | 215 ----- tests/transaction/test_lock_context.py | 85 ++ tests/transaction/test_lock_manager.py | 88 +++ tests/transaction/test_path_lock.py | 52 +- tests/transaction/test_post_actions.py | 112 --- tests/transaction/test_redo_log.py | 78 ++ tests/transaction/test_rm_rollback.py | 294 ------- tests/transaction/test_transaction_manager.py | 323 -------- tests/transaction/test_undo.py | 249 ------ 52 files changed, 1408 insertions(+), 4673 deletions(-) create mode 100644 openviking/storage/observers/lock_observer.py delete mode 100644 openviking/storage/observers/transaction_observer.py delete mode 100644 openviking/storage/transaction/context_manager.py delete mode 100644 openviking/storage/transaction/journal.py create mode 100644 openviking/storage/transaction/lock_context.py create mode 100644 openviking/storage/transaction/lock_handle.py create mode 100644 openviking/storage/transaction/lock_manager.py create mode 100644 openviking/storage/transaction/redo_log.py delete mode 100644 openviking/storage/transaction/transaction_manager.py delete mode 100644 openviking/storage/transaction/transaction_record.py delete mode 100644 openviking/storage/transaction/undo.py delete mode 100644 tests/transaction/test_context_manager.py delete mode 100644 tests/transaction/test_crash_recovery.py delete mode 100644 tests/transaction/test_journal.py create mode 100644 tests/transaction/test_lock_context.py create mode 100644 tests/transaction/test_lock_manager.py delete mode 100644 tests/transaction/test_post_actions.py create mode 100644 tests/transaction/test_redo_log.py delete mode 100644 tests/transaction/test_rm_rollback.py delete mode 100644 tests/transaction/test_transaction_manager.py delete mode 100644 tests/transaction/test_undo.py diff --git a/docs/design/multi-tenant-design.md b/docs/design/multi-tenant-design.md index 6c5ad43d..9a131ac2 100644 --- 
a/docs/design/multi-tenant-design.md +++ b/docs/design/multi-tenant-design.md @@ -283,7 +283,7 @@ def agent_space_name(self) -> str: | `agent/instructions` | `/{account_id}/agent/{agent_space}/instructions/` | account + user + agent | agent 的行为规则,每用户独立 | | `resources/` | `/{account_id}/resources/` | account | account 内共享的知识资源 | | `session/` | `/{account_id}/session/{user_space}/{session_id}/` | account + user | 用户的对话记录 | -| `transactions/` | `/{account_id}/transactions/` | account | 账户级事务记录 | +| `redo/` | `/{account_id}/_system/redo/` | account | 崩溃恢复 redo 标记 | | `_system/`(全局) | `/_system/` | 系统级 | 全局工作区列表 | | `_system/`(per-account) | `/{account_id}/_system/` | account | 用户注册表 | diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 3d7ced04..1ada00dc 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -1,6 +1,6 @@ -# Transaction Mechanism +# Path Locks and Crash Recovery -OpenViking's transaction mechanism protects the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain consistent even when failures occur. +OpenViking uses two simple primitives — **path locks** and **redo log** — to protect the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain consistent even when failures occur. ## Design Philosophy @@ -10,11 +10,11 @@ OpenViking is a context database where FS is the source of truth and VectorDB is ## Design Principles -1. **Transactions cover synchronous operations only**: FS + VectorDB operations run inside transactions; SemanticQueue/EmbeddingQueue enqueue runs after commit (as post_actions) — they are idempotent and retriable -2. **On by default**: All data operations automatically use transactions; no extra configuration needed -3. 
**Write-exclusive**: Path locks ensure only one write transaction can operate on a path at a time -4. **Undo Log model**: Record reverse operations before each change; replay them in reverse order on failure -5. **Persistent journal**: Each transaction writes a journal file to AGFS for crash recovery +1. **Write-exclusive**: Path locks ensure only one write operation can operate on a path at a time +2. **On by default**: All data operations automatically acquire locks; no extra configuration needed +3. **Lock as protection**: LockContext acquires locks on entry, releases on exit — no undo/journal/commit semantics +4. **Only session_memory needs crash recovery**: RedoLog re-executes memory extraction after a process crash +5. **Queue operations run outside locks**: SemanticQueue/EmbeddingQueue enqueue operations are idempotent and retriable ## Architecture @@ -22,23 +22,63 @@ OpenViking is a context database where FS is the source of truth and VectorDB is Service Layer (rm / mv / add_resource / session.commit) | v -+--[TransactionContext async context manager]--+ -| | -| 1. Create transaction + write journal | -| 2. Acquire path lock (poll + timeout) | -| 3. Execute operations (FS + VectorDB) | -| 4. Record Undo Log (mark completed) | -| 5. Commit / Rollback | -| 6. Execute post_actions (enqueue etc) | -| 7. Release lock + clean up journal | -| | -| On exception: reverse Undo Log + unlock | -+----------------------------------------------+ ++--[LockContext async context manager]--+ +| | +| 1. Create LockHandle | +| 2. Acquire path lock (poll+timeout) | +| 3. Execute operations (FS+VectorDB) | +| 4. 
Release lock | +| | +| On exception: auto-release lock, | +| exception propagates unchanged | ++---------------------------------------+ | v Storage Layer (VikingFS, VectorDB, QueueManager) ``` +## Two Core Components + +### Component 1: PathLock + LockManager + LockContext (Path Lock System) + +**PathLock** implements file-based distributed locks with two lock types — POINT and SUBTREE — using fencing tokens to prevent TOCTOU races and automatic stale lock detection and cleanup. + +**LockHandle** is a lightweight lock holder token: + +```python +@dataclass +class LockHandle: + id: str # Unique ID used to generate fencing tokens + locks: list[str] # Acquired lock file paths + created_at: float # Creation time +``` + +**LockManager** is a global singleton managing lock lifecycle: +- Creates/releases LockHandles +- Background cleanup of leaked locks (in-process safety net) +- Executes RedoLog recovery on startup + +**LockContext** is an async context manager encapsulating the lock/unlock lifecycle: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # Perform operations under lock protection + ... +# Lock automatically released on exit (including exceptions) +``` + +### Component 2: RedoLog (Crash Recovery) + +Used only for the memory extraction phase of `session.commit`. Writes a marker before the operation, deletes it after success, and scans for leftover markers on startup to redo. + +``` +/local/_system/redo/{task_id}/redo.json +``` + +Memory extraction is idempotent — re-extracting from the same archive produces the same result. + ## Consistency Issues and Solutions ### rm(uri) @@ -47,134 +87,138 @@ Storage Layer (VikingFS, VectorDB, QueueManager) |---------|----------| | Delete file first, then index -> file gone but index remains -> search returns non-existent file | **Reverse order**: delete index first, then file. 
Index deletion failure -> both file and index intact | -Transaction flow: +**Locking strategy** (depends on target type): +- Deleting a **directory**: `lock_mode="subtree"`, locks the directory itself +- Deleting a **file**: `lock_mode="point"`, locks the file's parent directory + +Operation flow: ``` -1. Begin transaction, acquire lock (lock_mode="subtree") -2. Snapshot VectorDB records (for rollback recovery) +1. Check whether target is a directory or file, choose lock mode +2. Acquire lock 3. Delete VectorDB index -> immediately invisible to search 4. Delete FS file -5. Commit -> release lock -> delete journal +5. Release lock ``` -Rollback: Step 4 fails -> restore VectorDB records from snapshot. +VectorDB deletion fails -> exception thrown, lock auto-released, file and index both intact. FS deletion fails -> VectorDB already deleted but file remains, retry is safe. ### mv(old_uri, new_uri) | Problem | Solution | |---------|----------| -| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Transaction wrapper; rollback on failure | +| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Copy first then update index; clean up copy on failure | + +**Locking strategy** (handled automatically via `lock_mode="mv"`): +- Moving a **directory**: SUBTREE lock on both source path and destination parent +- Moving a **file**: POINT lock on both source's parent and destination parent -Transaction flow: +Operation flow: ``` -1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on both source and destination for directories) -2. Move FS file -3. Update VectorDB URIs -4. Commit -> release lock -> delete journal +1. Check whether source is a directory or file, set src_is_dir +2. Acquire mv lock (internally chooses SUBTREE or POINT based on src_is_dir) +3. Copy to new location (source still intact, safe) +4. If directory, remove the lock file carried over by cp into the copy +5. 
Update VectorDB URIs + - Failure -> clean up copy, source and old index intact, consistent state +6. Delete source +7. Release lock ``` -Rollback: Step 3 fails -> move file back to original location. - ### add_resource | Problem | Solution | |---------|----------| | File moved from temp to final directory, then crash -> file exists but never searchable | Two separate paths for first-time add vs incremental update | -First-time add and incremental update are two independent paths: - **First-time add** (target does not exist) — handled in `ResourceProcessor.process_resource` Phase 3.5: ``` -1. Begin transaction, lock parent_path of final_uri (lock_mode="point") -2. Record undo: fs_write_new (uri=dst_path) -3. agfs.mv temp directory -> final location -4. Commit -> release lock -> delete journal -5. Clean up temp directory -6. Enqueue SemanticMsg(uri=final, target_uri=None) -> DAG runs on final, no callback +1. Acquire lock on parent_path of final_uri (lock_mode="point") +2. agfs.mv temp directory -> final location +3. Release lock +4. Clean up temp directory +5. Enqueue SemanticMsg -> DAG runs on final ``` -Crash recovery: Undo deletes the incomplete dst_path; re-run `add_resource` to retry. - **Incremental update** (target already exists) — temp stays in place: ``` 1. Enqueue SemanticMsg(uri=temp, target_uri=final) -> DAG runs on temp 2. DAG completion triggers sync_diff_callback or move_temp_to_target_callback -3. Each VikingFS.rm / VikingFS.mv inside callbacks creates its own independent transaction +3. Each VikingFS.rm / VikingFS.mv inside callbacks acquires its own lock ``` -Note: DAG callbacks do NOT wrap operations in an outer TransactionContext. Each `VikingFS.rm` and `VikingFS.mv` has its own transaction internally. An outer lock would conflict with these inner locks (e.g. outer POINT lock on target_path vs inner SUBTREE lock from `rm`) causing deadlock. +Note: DAG callbacks do NOT wrap operations in an outer lock. 
Each `VikingFS.rm` and `VikingFS.mv` has its own lock internally. An outer lock would conflict with these inner locks causing deadlock. ### session.commit() | Problem | Solution | |---------|----------| -| Messages cleared but archive not written -> conversation data lost | Phase 1 without transaction (incomplete archive has no side effects) + Phase 2 with redo transaction | +| Messages cleared but archive not written -> conversation data lost | Phase 1 without lock (incomplete archive has no side effects) + Phase 2 with RedoLog | -LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding transaction. The design splits into two phases: +LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding operation. The design splits into two phases: ``` -Phase 1 — Archive (no transaction, no lock): +Phase 1 — Archive (no lock): 1. Generate archive summary (LLM) 2. Write archive (history/archive_N/messages.jsonl + summaries) 3. Clear messages.jsonl 4. Clear in-memory message list -Phase 2 — Memory extraction + write (transaction, lock_mode="none", redo semantics): - 1. Record init_info (archive_uri, session_uri, user identity) +Phase 2 — Memory extraction + write (RedoLog): + 1. Write redo marker (archive_uri, session_uri, user identity) 2. Extract memories from archived messages (LLM) 3. Write current message state 4. Write relations - 5. Register post_action: enqueue SemanticQueue - 6. Commit + 5. Directly enqueue SemanticQueue + 6. Delete redo marker ``` -**Redo semantics**: Phase 2 does not register undo log entries. On crash recovery, memory extraction and writing are re-executed from the archive (`_redo_session_memory`) instead of being rolled back. 
- **Crash recovery analysis**: | Crash point | State | Recovery action | |------------|-------|----------------| -| During Phase 1 archive write | No transaction | Incomplete archive; next commit scans history/ for index, unaffected | -| Phase 1 archive complete but messages not cleared | No transaction | Archive complete + messages still present = redundant but safe | -| During Phase 2 memory extraction/write | Journal EXEC | On startup: `_redo_session_memory` redoes extraction + write + enqueue from archive | -| After Phase 2 commit | Journal COMMIT | On startup: replay `post_action("enqueue_semantic")` | +| During Phase 1 archive write | No marker | Incomplete archive; next commit scans history/ for index, unaffected | +| Phase 1 archive complete but messages not cleared | No marker | Archive complete + messages still present = redundant but safe | +| During Phase 2 memory extraction/write | Redo marker exists | On startup: redo extraction + write + enqueue from archive | +| Phase 2 complete | Redo marker deleted | No recovery needed | -## TransactionContext +## LockContext -`TransactionContext` is an **async** context manager that encapsulates the full transaction lifecycle: +`LockContext` is an **async** context manager that encapsulates lock acquisition and release: ```python -from openviking.storage.transaction import TransactionContext, get_transaction_manager +from openviking.storage.transaction import LockContext, get_lock_manager -tx_manager = get_transaction_manager() +lock_manager = get_lock_manager() -async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - # Record undo (call before making changes) - seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) - # Execute change - delete_from_vector_store(uris) - # Mark completed - tx.mark_completed(seq) +# Point lock (write operations, semantic processing) +async with LockContext(lock_manager, [path], lock_mode="point"): + # Perform 
operations... + pass - # Register post-commit action (optional) - tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) +# Subtree lock (delete operations) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # Perform operations... + pass - # Commit - await tx.commit() -# Auto-rollback if commit() not called +# MV lock (move operations) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + # Perform operations... + pass ``` **Lock modes**: | lock_mode | Use case | Behavior | |-----------|----------|----------| -| `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | +| `point` | Write operations, semantic processing | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | | `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | | `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | -| `none` | Lock-free operations | Skip lock acquisition, transition directly to EXEC status. Used for session.commit Phase 2 and other scenarios that don't require path mutual exclusion | + +**Exception handling**: `__aexit__` always releases locks and does not swallow exceptions. Lock acquisition failure raises `LockAcquisitionError`. ## Lock Types (POINT vs SUBTREE) @@ -188,33 +232,6 @@ The lock mechanism uses two lock types to handle different conflict patterns: - **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. - **SUBTREE (S)**: Used for rm and mv operations. Logically covers the entire subtree but only writes **one lock file** at the root. 
Before acquiring, scans all descendants and ancestor directories for conflicting locks. -## Undo Log - -Each transaction maintains an Undo Log recording the reverse action for each step: - -| op_type | Forward operation | Rollback action | -|---------|-------------------|-----------------| -| `fs_mv` | Move file | Move back | -| `fs_rm` | Delete file | Skip (irreversible; rm is always the last step by design) | -| `fs_write_new` | Create new file/directory | Delete | -| `fs_mkdir` | Create directory | Delete | -| `vectordb_delete` | Delete index records | Restore from snapshot | -| `vectordb_upsert` | Insert index records | Delete | -| `vectordb_update_uri` | Update URI | Restore old value | - -Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. - -### Context Reconstruction - -VectorDB rollback operations require a `RequestContext` (containing account_id, user_id, agent_id, role). Since the original context is unavailable during crash recovery, `_ctx_*` fields are serialized into undo params when calling record_undo: - -- `_ctx_account_id`: Account ID -- `_ctx_user_id`: User ID -- `_ctx_agent_id`: Agent ID -- `_ctx_role`: Role - -During rollback, `_reconstruct_ctx()` rebuilds the context from these fields. If reconstruction fails (missing fields), the VectorDB rollback step is skipped with a warning. - ## Lock Mechanism ### Lock Protocol @@ -223,7 +240,7 @@ Lock file path: `{path}/.path.ovlock` Lock file content (Fencing Token): ``` -{transaction_id}:{time_ns}:{lock_type} +{handle_id}:{time_ns}:{lock_type} ``` Where `lock_type` is `P` (POINT) or `S` (SUBTREE). @@ -233,7 +250,7 @@ Where `lock_type` is `P` (POINT) or `S` (SUBTREE). ``` loop until timeout (poll interval: 200ms): 1. Check target directory exists - 2. 
Check if target directory is locked by another transaction + 2. Check if target directory is locked by another operation - Stale lock? -> remove and retry - Active lock? -> wait 3. Check all ancestor directories for SUBTREE locks @@ -241,8 +258,8 @@ loop until timeout (poll interval: 200ms): - Active lock? -> wait 4. Write POINT (P) lock file 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks - - Conflict found: compare (timestamp, tx_id) - - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock - Wait and retry 6. Verify lock file ownership (fencing token matches) 7. Success @@ -255,19 +272,19 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError ``` loop until timeout (poll interval: 200ms): 1. Check target directory exists - 2. Check if target directory is locked by another transaction + 2. Check if target directory is locked by another operation - Stale lock? -> remove and retry - Active lock? -> wait 3. Check all ancestor directories for SUBTREE locks - Stale lock? -> remove and retry - Active lock? -> wait - 4. Scan all descendant directories for any locks by other transactions + 4. Scan all descendant directories for any locks by other operations - Stale lock? -> remove and retry - Active lock? -> wait 5. Write SUBTREE (S) lock file (only one file, at the root path) 6. TOCTOU double-check: re-scan descendants and ancestors - - Conflict found: compare (timestamp, tx_id) - - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock - Wait and retry 7. Verify lock file ownership (fencing token matches) 8. 
Success @@ -279,72 +296,33 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError **Stale lock detection**: PathLock checks the fencing token timestamp. Locks older than `lock_expire` (default 300s) are considered stale and are removed automatically during acquisition. -**Transaction timeout**: TransactionManager checks active transactions every 60 seconds. Transactions with `updated_at` exceeding the transaction timeout (default 3600s) are rolled back. - -## Transaction Journal - -Each transaction persists a journal in AGFS: +**In-process cleanup**: LockManager checks active LockHandles every 60 seconds. Handles created more than 3600 seconds ago are force-released. -``` -/local/_system/transactions/{tx_id}/journal.json -``` - -Contains: transaction ID, status, lock paths, init_info, undo_log, post_actions. - -### Lifecycle - -``` -Create transaction -> write journal (INIT) -Acquire lock -> update journal (ACQUIRE -> EXEC) -Execute changes -> update journal per step (mark undo entry completed) -Commit -> update journal (COMMIT + post_actions) - -> execute post_actions -> release locks -> delete journal -Rollback -> execute undo log -> release locks -> delete journal -``` +**Orphan locks**: Lock files left behind after a process crash are automatically removed via stale lock detection when any operation next attempts to acquire a lock on the same path. 
## Crash Recovery -`TransactionManager.start()` automatically scans for residual journals on startup: +`LockManager.start()` automatically scans for leftover markers in `/local/_system/redo/` on startup: -| Journal status at crash | Recovery action | -|------------------------|----------------| -| `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | -| `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (`session_memory` operation) | Redo memory extraction + write from archive (`_redo_session_memory`) -> release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (all undo entries completed) | Roll forward (treat as committed, replay post_actions) -> release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (other) | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | -| `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | +| Scenario | Recovery action | +|----------|----------------| +| session_memory extraction crash | Redo memory extraction + write + enqueue from archive | +| Crash while holding lock | Lock file remains in AGFS; stale detection auto-cleans on next acquisition (default 300s expiry) | +| Crash after enqueue, before worker processes | QueueFS SQLite persistence; worker auto-pulls after restart | +| Orphan index | Cleaned on L2 on-demand load | ### Defense Summary | Failure scenario | Defense | Recovery timing | |-----------------|--------|-----------------| -| Crash during transaction | Journal + undo log rollback | On restart | -| Crash after commit, before enqueue | Journal post_actions replay | On restart | -| Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | -| Crash during session.commit Phase 2 | Journal + redo (re-extract memories from archive) | On restart | -| Orphan index | 
Cleaned on L2 on-demand load | When user accesses | -| Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | - -## Transaction State Machine - -``` -INIT -> ACQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED - | - FAIL -> RELEASING -> RELEASED -``` - -- `INIT`: Transaction created, waiting for lock -- `ACQUIRE`: Acquiring lock -- `EXEC`: Transaction operations executing -- `COMMIT`: Committed, post_actions may be pending -- `FAIL`: Execution failed, entering rollback -- `RELEASING`: Releasing locks -- `RELEASED`: Locks released, transaction complete +| Crash during operation | Lock auto-expires + stale detection | Next acquisition of same path lock | +| Crash during session.commit Phase 2 | RedoLog marker + redo | On restart | +| Crash after enqueue, before worker | QueueFS SQLite persistence | Worker restart | +| Orphan index | L2 on-demand load cleanup | When user accesses | ## Configuration -The transaction mechanism is enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: +Path locks are enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: ```json { @@ -364,7 +342,7 @@ The transaction mechanism is enabled by default with no extra configuration need ### QueueFS Persistence -The transaction mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. +The lock mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. 
## Related Documentation diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index d9ddd547..1a60fa89 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -640,15 +640,14 @@ For startup and deployment details see [Deployment](./03-deployment.md), for aut ## storage.transaction Section -The transaction mechanism is enabled by default and usually requires no configuration. **The default behavior is no-wait**: if the target path is already locked by another transaction, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. +Path locks are enabled by default and usually require no configuration. **The default behavior is no-wait**: if the target path is already locked by another operation, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. ```json { "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -658,9 +657,8 @@ The transaction mechanism is enabled by default and usually requires no configur |-----------|------|-------------|---------| | `lock_timeout` | float | Path lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds, then raise `LockAcquisitionError`. | `0.0` | | `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | -| `max_parallel_locks` | int | Max parallel locks during recursive locking for rm/mv operations | `8` | -For details on the transaction mechanism, see [Transaction Mechanism](../concepts/09-transaction.md). +For details on the lock mechanism, see [Path Locks and Crash Recovery](../concepts/09-transaction.md). 
## Full Schema @@ -698,8 +696,7 @@ For details on the transaction mechanism, see [Transaction Mechanism](../concept }, "transaction": { "lock_timeout": 0.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 }, "vectordb": { "backend": "local|remote", diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 2d42815a..31d09c54 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -1,6 +1,6 @@ -# 事务机制 +# 路径锁与崩溃恢复 -OpenViking 的事务机制保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 +OpenViking 通过**路径锁**和**Redo Log** 两个简单原语保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 ## 设计哲学 @@ -10,171 +10,214 @@ OpenViking 是上下文数据库,FS 是源数据,VectorDB 是派生索引。 ## 设计原则 -1. **事务只覆盖同步部分**:FS + VectorDB 操作在事务内;SemanticQueue/EmbeddingQueue 的 enqueue 在事务提交后执行(post_actions),它们是幂等的,失败可重试 -2. **默认生效**:所有数据操作命令自动开启事务机制,用户无需额外配置 -3. **写互斥**:通过路径锁保证同一路径同一时间只有一个写事务 -4. **Undo Log 模型**:变更前记录反向操作,失败时反序执行回滚 -5. **事务日志持久化**:每个事务在 AGFS 中写入 journal 文件,支持崩溃恢复 +1. **写互斥**:通过路径锁保证同一路径同一时间只有一个写操作 +2. **默认生效**:所有数据操作命令自动加锁,用户无需额外配置 +3. **锁即保护**:进入 LockContext 时加锁,退出时释放,没有 undo/journal/commit 语义 +4. **仅 session_memory 需要崩溃恢复**:通过 RedoLog 在进程崩溃后重做记忆提取 +5. **Queue 操作在锁外执行**:SemanticQueue/EmbeddingQueue 的 enqueue 是幂等的,失败可重试 ## 架构 ``` Service Layer (rm / mv / add_resource / session.commit) - │ - ▼ -┌──[TransactionContext 异步上下文管理器]──┐ -│ │ -│ 1. 创建事务 + 写 journal │ -│ 2. 获取路径锁(轮询 + 超时) │ -│ 3. 执行操作(FS + VectorDB) │ -│ 4. 记录 Undo Log(每步完成后标记) │ -│ 5. Commit / Rollback │ -│ 6. 执行 post_actions(enqueue 等) │ -│ 7. 释放锁 + 清理 journal │ -│ │ -│ 异常时:反序执行 Undo Log → 释放锁 │ -└─────────────────────────────────────────┘ - │ - ▼ + | + v ++--[LockContext 异步上下文管理器]-------+ +| | +| 1. 创建 LockHandle | +| 2. 获取路径锁(轮询 + 超时) | +| 3. 执行操作(FS + VectorDB) | +| 4. 
释放锁 | +| | +| 异常时:自动释放锁,异常原样传播 | ++---------------------------------------+ + | + v Storage Layer (VikingFS, VectorDB, QueueManager) ``` +## 两个核心组件 + +### 组件 1:PathLock + LockManager + LockContext(路径锁系统) + +**PathLock** 实现基于文件的分布式锁,支持 POINT 和 SUBTREE 两种锁类型,使用 fencing token 防止 TOCTOU 竞争,自动检测并清理过期锁。 + +**LockHandle** 是轻量的锁持有者令牌: + +```python +@dataclass +class LockHandle: + id: str # 唯一标识,用于生成 fencing token + locks: list[str] # 已获取的锁文件路径 + created_at: float # 创建时间 +``` + +**LockManager** 是全局单例,管理锁生命周期: +- 创建/释放 LockHandle +- 后台清理泄漏的锁(进程内安全网) +- 启动时执行 RedoLog 恢复 + +**LockContext** 是异步上下文管理器,封装加锁/解锁生命周期: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # 在锁保护下执行操作 + ... +# 退出时自动释放锁(包括异常情况) +``` + +### 组件 2:RedoLog(崩溃恢复) + +仅用于 `session.commit` 的记忆提取阶段。操作前写标记,成功后删标记,启动时扫描遗留标记并重做。 + +``` +/local/_system/redo/{task_id}/redo.json +``` + +Memory 提取是幂等的 — 从同一个 archive 重新提取会得到相同结果。 + ## 一致性问题与解决方案 ### rm(uri) | 问题 | 方案 | |------|------| -| 先删文件再删索引 → 文件已删但索引残留 → 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 → 文件和索引都在,搜索正常 | +| 先删文件再删索引 -> 文件已删但索引残留 -> 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 -> 文件和索引都在,搜索正常 | + +**加锁策略**(根据目标类型区分): +- 删除**目录**:`lock_mode="subtree"`,锁目录自身 +- 删除**文件**:`lock_mode="point"`,锁文件的父目录 -事务流程: +操作流程: ``` -1. 开始事务,加锁(lock_mode="subtree") -2. 快照 VectorDB 中受影响的记录(用于回滚恢复) -3. 删除 VectorDB 索引 → 搜索立刻不可见 +1. 检查目标是目录还是文件,选择锁模式 +2. 获取锁 +3. 删除 VectorDB 索引 -> 搜索立刻不可见 4. 删除 FS 文件 -5. 提交 → 删锁 → 删 journal +5. 释放锁 ``` -回滚:第 4 步失败 → 从快照恢复 VectorDB 记录,文件和索引都在。 +VectorDB 删除失败 -> 直接抛异常,锁自动释放,文件和索引都在。FS 删除失败 -> VectorDB 已删但文件还在,重试即可。 ### mv(old_uri, new_uri) | 问题 | 方案 | |------|------| -| 文件移到新路径但索引指向旧路径 → 搜索返回旧路径(不存在) | 事务包装,移动失败则回滚 | +| 文件移到新路径但索引指向旧路径 -> 搜索返回旧路径(不存在) | 先 copy 再更新索引,失败时清理副本 | + +**加锁策略**(通过 `lock_mode="mv"` 自动处理): +- 移动**目录**:源路径和目标父目录各加 SUBTREE 锁 +- 移动**文件**:源的父目录和目标父目录各加 POINT 锁 -事务流程: +操作流程: ``` -1. 
开始事务,加锁(lock_mode="mv",目录移动时源和目标均 SUBTREE) -2. 移动 FS 文件 -3. 更新 VectorDB 中的 URI -4. 提交 → 删锁 → 删 journal +1. 检查源是目录还是文件,确定 src_is_dir +2. 获取 mv 锁(内部根据 src_is_dir 选择 SUBTREE 或 POINT) +3. Copy 到新位置(源还在,安全) +4. 如果是目录,删除副本中被 cp 带过去的锁文件 +5. 更新 VectorDB 中的 URI + - 失败 -> 清理副本,源和旧索引都在,一致状态 +6. 删除源 +7. 释放锁 ``` -回滚:第 3 步失败 → 把文件移回原位。 - ### add_resource | 问题 | 方案 | |------|------| -| 文件从临时目录移到正式目录后崩溃 → 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | - -首次添加和增量更新是两条独立路径: +| 文件从临时目录移到正式目录后崩溃 -> 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | **首次添加**(target 不存在)— 在 `ResourceProcessor.process_resource` Phase 3.5 中处理: ``` -1. 开始事务,锁 final_uri 的父目录(lock_mode="point") -2. 记录 undo: fs_write_new(uri=dst_path) -3. agfs.mv 临时目录 → 正式位置 -4. 提交 → 删锁 → 删 journal -5. 清理临时目录 -6. 入队 SemanticMsg(uri=final, target_uri=None) → DAG 在 final 上跑,无 callback +1. 获取锁,锁 final_uri 的父目录(lock_mode="point") +2. agfs.mv 临时目录 -> 正式位置 +3. 释放锁 +4. 清理临时目录 +5. 入队 SemanticMsg -> DAG 在 final 上跑 ``` -崩溃恢复:undo 删除不完整的 dst_path;重新执行 `add_resource` 即可重试。 - **增量更新**(target 已存在)— temp 保持不动: ``` -1. 入队 SemanticMsg(uri=temp, target_uri=final) → DAG 在 temp 上跑 +1. 入队 SemanticMsg(uri=temp, target_uri=final) -> DAG 在 temp 上跑 2. DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback -3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自创建独立事务 +3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自独立加锁 ``` -注意:DAG callback 不在外层包裹 TransactionContext。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立事务保护。外层锁会与内部锁冲突(如外层 POINT lock on target_path 与内部 `rm` 的 SUBTREE lock 冲突)导致死锁。 +注意:DAG callback 不在外层加锁。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立锁保护。外层锁会与内部锁冲突导致死锁。 ### session.commit() | 问题 | 方案 | |------|------| -| 消息已清空但 archive 未写入 → 对话数据丢失 | Phase 1 无事务(archive 不完整无副作用)+ Phase 2 redo 事务 | +| 消息已清空但 archive 未写入 -> 对话数据丢失 | Phase 1 无锁(archive 不完整无副作用)+ Phase 2 RedoLog | -LLM 调用耗时不可控(5s~60s+),不能放在持锁事务内。设计拆为两个阶段: +LLM 调用耗时不可控(5s~60s+),不能放在持锁操作内。设计拆为两个阶段: ``` -Phase 1 — 归档(无事务、无锁): +Phase 1 — 归档(无锁): 1. 生成归档摘要(LLM) 2. 写 archive(history/archive_N/messages.jsonl + 摘要) 3. 
清空 messages.jsonl 4. 清空内存中的消息列表 -Phase 2 — 记忆提取 + 写入(事务,lock_mode="none",redo 语义): - 1. 记录 init_info(archive_uri、session_uri、用户身份信息) +Phase 2 — 记忆提取 + 写入(RedoLog): + 1. 写 redo 标记(archive_uri、session_uri、用户身份信息) 2. 从归档消息提取 memories(LLM) 3. 写当前消息状态 4. 写 relations - 5. 注册 post_action: enqueue SemanticQueue - 6. 提交 + 5. 直接 enqueue SemanticQueue + 6. 删除 redo 标记 ``` -**Redo 语义**:Phase 2 不注册 undo log。崩溃恢复时从 archive 重新执行记忆提取和写入(`_redo_session_memory`),而非回滚。 - **崩溃恢复分析**: | 崩溃时间点 | 状态 | 恢复动作 | |-----------|------|---------| -| Phase 1 写 archive 中途 | 无事务 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | -| Phase 1 archive 完成但 messages 未清空 | 无事务 | archive 完整 + messages 仍在 = 数据冗余但安全 | -| Phase 2 记忆提取/写入中途 | journal EXEC | 启动恢复:`_redo_session_memory` 从 archive 重做提取+写入+入队 | -| Phase 2 commit 后 | journal COMMIT | 启动恢复:重放 `post_action("enqueue_semantic")` | +| Phase 1 写 archive 中途 | 无标记 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | +| Phase 1 archive 完成但 messages 未清空 | 无标记 | archive 完整 + messages 仍在 = 数据冗余但安全 | +| Phase 2 记忆提取/写入中途 | redo 标记存在 | 启动恢复:从 archive 重做提取+写入+入队 | +| Phase 2 完成 | redo 标记已删 | 无需恢复 | -## TransactionContext +## LockContext -`TransactionContext` 是**异步**上下文管理器,封装事务的完整生命周期: +`LockContext` 是**异步**上下文管理器,封装锁的获取和释放: ```python -from openviking.storage.transaction import TransactionContext, get_transaction_manager +from openviking.storage.transaction import LockContext, get_lock_manager -tx_manager = get_transaction_manager() +lock_manager = get_lock_manager() -async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - # 记录 undo(变更前调用) - seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) - # 执行变更 - delete_from_vector_store(uris) - # 标记完成 - tx.mark_completed(seq) +# Point 锁(写操作、语义处理) +async with LockContext(lock_manager, [path], lock_mode="point"): + # 执行操作... 
+ pass - # 注册提交后动作(可选) - tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) +# Subtree 锁(删除操作) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # 执行操作... + pass - # 提交 - await tx.commit() -# 未 commit 时自动回滚 +# MV 锁(移动操作) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + # 执行操作... + pass ``` **锁模式**: | lock_mode | 用途 | 行为 | |-----------|------|------| -| `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `point` | 写操作、语义处理 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | | `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | | `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | -| `none` | 无锁操作 | 跳过锁获取,直接进入 EXEC 状态。用于 session.commit Phase 2 等不需要路径互斥的场景 | + +**异常处理**:`__aexit__` 总是释放锁,不吞异常。获取锁失败时抛出 `LockAcquisitionError`。 ## 锁类型(POINT vs SUBTREE) @@ -188,33 +231,6 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as - **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 - **SUBTREE (S)**:用于删除和移动操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代和祖先目录确认无冲突锁。 -## Undo Log - -每个事务维护一个 Undo Log,记录每步操作的反向动作: - -| op_type | 正向操作 | 回滚动作 | -|---------|---------|---------| -| `fs_mv` | 移动文件 | 移回原位 | -| `fs_rm` | 删除文件 | 跳过(不可逆,设计上 rm 是最后一步) | -| `fs_write_new` | 创建新文件/目录 | 删除 | -| `fs_mkdir` | 创建目录 | 删除 | -| `vectordb_delete` | 删除索引记录 | 从快照恢复 | -| `vectordb_upsert` | 插入索引记录 | 删除 | -| `vectordb_update_uri` | 更新 URI | 恢复旧值 | - -回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 - -### 上下文重建 - -VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、agent_id、role)。由于崩溃恢复时原始上下文不可用,record_undo 时在 undo params 中序列化 `_ctx_*` 字段: - -- `_ctx_account_id`:账户 ID -- `_ctx_user_id`:用户 ID -- `_ctx_agent_id`:代理 ID -- `_ctx_role`:角色 - -回滚时通过 `_reconstruct_ctx()` 从这些字段重建上下文。若重建失败(字段缺失),该 VectorDB 回滚步骤将被跳过并记录警告。 - ## 锁机制 ### 锁协议 @@ -223,7 +239,7 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age 锁文件内容(Fencing 
Token): ``` -{transaction_id}:{time_ns}:{lock_type} +{handle_id}:{time_ns}:{lock_type} ``` 其中 `lock_type` 为 `P`(POINT)或 `S`(SUBTREE)。 @@ -233,16 +249,16 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age ``` 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 - 2. 检查目标路径是否被其他事务锁定 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 3. 检查所有祖先目录是否有 SUBTREE 锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 4. 写入 POINT (P) 锁文件 5. TOCTOU 双重检查:重新扫描祖先目录的 SUBTREE 锁 - - 发现冲突:比较 (timestamp, tx_id) - - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 - 等待后重试 6. 验证锁文件归属(fencing token 匹配) 7. 成功 @@ -255,19 +271,19 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age ``` 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 - 2. 检查目标路径是否被其他事务锁定 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 3. 检查所有祖先目录是否有 SUBTREE 锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 - 4. 扫描所有后代目录,检查是否有其他事务持有的锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 4. 扫描所有后代目录,检查是否有其他操作持有的锁 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 5. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) 6. TOCTOU 双重检查:重新扫描后代目录和祖先目录 - - 发现冲突:比较 (timestamp, tx_id) - - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 - 等待后重试 7. 验证锁文件归属(fencing token 匹配) 8. 
成功 @@ -279,72 +295,33 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age **陈旧锁检测**:PathLock 检查 fencing token 中的时间戳。超过 `lock_expire`(默认 300s)的锁被视为陈旧锁,在加锁过程中自动移除。 -**事务超时**:TransactionManager 每 60 秒检查活跃事务,`updated_at` 超过事务超时时间(默认 3600s)的事务强制回滚。 - -## 事务日志(Journal) - -每个事务在 AGFS 持久化一份 journal: +**进程内清理**:LockManager 每 60 秒检查活跃的 LockHandle,创建超过 3600 秒的 handle 强制释放。 -``` -/local/_system/transactions/{tx_id}/journal.json -``` - -内容包含:事务 ID、状态、锁路径、init_info、undo_log、post_actions。 - -### 生命周期 - -``` -创建事务 → 写 journal(INIT) -获取锁 → 更新 journal(ACQUIRE → EXEC) -执行变更 → 每步更新 journal(标记 undo entry completed) -提交 → 更新 journal(COMMIT + post_actions) - → 执行 post_actions → 删锁 → 删 journal -回滚 → 执行 undo log → 删锁 → 删 journal -``` +**孤儿锁**:进程崩溃后遗留的锁文件,在下次任何操作尝试获取同一路径锁时,通过 stale lock 检测自动移除。 ## 崩溃恢复 -`TransactionManager.start()` 启动时自动扫描残留 journal: +`LockManager.start()` 启动时自动扫描 `/local/_system/redo/` 目录中的遗留标记: -| 崩溃时 journal 状态 | 恢复方式 | -|---------------------|---------| -| `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | -| `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(`session_memory` 操作) | 从 archive 重做记忆提取+写入(`_redo_session_memory`) → 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(所有 undo 均 completed) | 前滚(视为已提交,重放 post_actions) → 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(其他) | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | -| `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | +| 场景 | 恢复方式 | +|------|---------| +| session_memory 提取中途崩溃 | 从 archive 重做记忆提取 + 写入 + enqueue | +| 锁持有期间崩溃 | 锁文件留在 AGFS,下次获取时 stale 检测自动清理(默认 300s 过期)| +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化,worker 重启后自动拉取 | +| 孤儿索引 | L2 按需加载时清理 | ### 防线总结 | 异常场景 | 防线 | 恢复时机 | |---------|------|---------| -| 事务内崩溃 | journal + undo log 回滚 | 重启时 | -| 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | -| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | -| session.commit Phase 2 中崩溃 | journal + redo(从 
archive 重做记忆提取) | 重启时 | +| 操作中途崩溃 | 锁自动过期 + stale 检测 | 下次获取同路径锁时 | +| session.commit Phase 2 崩溃 | RedoLog 标记 + 重做 | 重启时 | +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | -| 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | - -## 事务状态机 - -``` -INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED - ↓ - FAIL → RELEASING → RELEASED -``` - -- `INIT`:事务已创建,等待锁获取 -- `ACQUIRE`:正在获取锁 -- `EXEC`:事务操作执行中 -- `COMMIT`:已提交,可能有 post_actions 待执行 -- `FAIL`:执行失败,进入回滚 -- `RELEASING`:正在释放锁 -- `RELEASED`:锁已释放,事务结束 ## 配置 -事务机制默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: +路径锁默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: ```json { @@ -360,11 +337,11 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | -| `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | +| `lock_expire` | float | 锁过期时间(秒),超过此时间的锁将被视为陈旧锁并强制释放 | `300.0` | ### QueueFS 持久化 -事务机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 +路径锁机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 ## 相关文档 diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index e4befcde..b0954bbf 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -615,15 +615,14 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 ## storage.transaction 段 -事务机制默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他事务锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 +路径锁默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他操作锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 ```json { "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -632,10 +631,9 @@ HTTP 客户端(`SyncHTTPClient` / 
`AsyncHTTPClient`)和 CLI 工具连接远 | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `lock_timeout` | float | 获取路径锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间后抛出 `LockAcquisitionError` | `0.0` | -| `lock_expire` | float | 锁过期时间(秒)。超过此时间的事务锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | -| `max_parallel_locks` | int | rm/mv 操作递归加锁时的最大并行数 | `8` | +| `lock_expire` | float | 锁过期时间(秒)。超过此时间的锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | -事务机制的详细说明见 [事务机制](../concepts/09-transaction.md)。 +路径锁机制的详细说明见 [路径锁与崩溃恢复](../concepts/09-transaction.md)。 ## 完整 Schema @@ -673,8 +671,7 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 }, "transaction": { "lock_timeout": 0.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 }, "vectordb": { "backend": "local|remote", diff --git a/openviking/async_client.py b/openviking/async_client.py index 680b6ee8..b87b05ff 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -97,10 +97,10 @@ async def reset(cls) -> None: await cls._instance.close() cls._instance = None - # Also reset transaction manager singleton - from openviking.storage.transaction import reset_transaction_manager + # Also reset lock manager singleton + from openviking.storage.transaction import reset_lock_manager - reset_transaction_manager() + reset_lock_manager() # ============= Session methods ============= diff --git a/openviking/server/routers/content.py b/openviking/server/routers/content.py index 9b4d0279..2463cc08 100644 --- a/openviking/server/routers/content.py +++ b/openviking/server/routers/content.py @@ -102,7 +102,7 @@ async def reindex( database. If regenerate=True, also regenerates L0/L1 summaries via LLM before re-embedding. - Uses transaction locking to prevent concurrent reindexes on the same URI. + Uses path locking to prevent concurrent reindexes on the same URI. Set wait=False to run in the background and track progress via task API. 
""" from openviking.service.task_tracker import get_task_tracker @@ -164,27 +164,17 @@ async def _do_reindex( regenerate: bool, ctx: RequestContext, ) -> dict: - """Execute reindex within a transaction.""" - from openviking.storage.transaction import get_transaction_manager + """Execute reindex within a lock scope.""" + from openviking.storage.transaction import LockContext, get_lock_manager - tm = get_transaction_manager() - tx = tm.create_transaction(init_info={"uri": uri, "regenerate": regenerate}) - await tm.begin(tx.id) + viking_fs = service.viking_fs + path = viking_fs._uri_to_path(uri, ctx=ctx) - try: - await tm.acquire_lock_normal(tx.id, uri) + async with LockContext(get_lock_manager(), [path], lock_mode="point"): if regenerate: - result = await service.resources.summarize([uri], ctx=ctx) + return await service.resources.summarize([uri], ctx=ctx) else: - result = await service.resources.build_index([uri], ctx=ctx) - await tm.commit(tx.id) - return result - except Exception: - try: - await tm.rollback(tx.id) - except Exception: - pass - raise + return await service.resources.build_index([uri], ctx=ctx) async def _background_reindex_tracked( diff --git a/openviking/server/routers/observer.py b/openviking/server/routers/observer.py index 4d214cbf..e1910596 100644 --- a/openviking/server/routers/observer.py +++ b/openviking/server/routers/observer.py @@ -72,13 +72,13 @@ async def observer_vlm( return Response(status="ok", result=_component_to_dict(component)) -@router.get("/transaction") -async def observer_transaction( +@router.get("/lock") +async def observer_lock( _ctx: RequestContext = Depends(get_request_context), ): - """Get transaction system status.""" + """Get lock system status.""" service = get_service() - component = service.debug.observer.transaction + component = service.debug.observer.lock return Response(status="ok", result=_component_to_dict(component)) diff --git a/openviking/service/core.py b/openviking/service/core.py index 
4c5a2670..8fd1c701 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -23,7 +23,7 @@ from openviking.storage import VikingDBManager from openviking.storage.collection_schemas import init_context_collection from openviking.storage.queuefs.queue_manager import QueueManager, init_queue_manager -from openviking.storage.transaction import TransactionManager, init_transaction_manager +from openviking.storage.transaction import LockManager, init_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking.utils.resource_processor import ResourceProcessor from openviking.utils.skill_processor import SkillProcessor @@ -75,7 +75,7 @@ def __init__( self._resource_processor: Optional[ResourceProcessor] = None self._skill_processor: Optional[SkillProcessor] = None self._session_compressor: Optional[SessionCompressor] = None - self._transaction_manager: Optional[TransactionManager] = None + self._lock_manager: Optional[LockManager] = None self._directory_initializer: Optional[DirectoryInitializer] = None # Sub-services @@ -142,16 +142,14 @@ def _init_storage( if self._queue_manager: self._queue_manager.setup_standard_queues(self._vikingdb_manager, start=False) - # Initialize TransactionManager (fail-fast if AGFS missing) + # Initialize LockManager (fail-fast if AGFS missing) if self._agfs_client is None: - raise RuntimeError("AGFS client not initialized for TransactionManager") + raise RuntimeError("AGFS client not initialized for LockManager") tx_cfg = config.transaction - self._transaction_manager = init_transaction_manager( + self._lock_manager = init_lock_manager( agfs=self._agfs_client, - max_parallel_locks=tx_cfg.max_parallel_locks, lock_timeout=tx_cfg.lock_timeout, lock_expire=tx_cfg.lock_expire, - vector_store=self._vikingdb_manager, ) @property @@ -170,9 +168,9 @@ def vikingdb_manager(self) -> Optional[VikingDBManager]: return self._vikingdb_manager @property - def transaction_manager(self) -> 
Optional[TransactionManager]: - """Get TransactionManager instance.""" - return self._transaction_manager + def lock_manager(self) -> Optional[LockManager]: + """Get LockManager instance.""" + return self._lock_manager @property def session_compressor(self) -> Optional[SessionCompressor]: @@ -293,10 +291,10 @@ async def initialize(self) -> None: self._skill_processor = SkillProcessor(vikingdb=self._vikingdb_manager) self._session_compressor = SessionCompressor(vikingdb=self._vikingdb_manager) - # Start TransactionManager if initialized - if self._transaction_manager: - await self._transaction_manager.start() - logger.info("TransactionManager started") + # Start LockManager if initialized + if self._lock_manager: + await self._lock_manager.start() + logger.info("LockManager started") # Wire up sub-services self._fs_service.set_viking_fs(self._viking_fs) @@ -324,9 +322,9 @@ async def initialize(self) -> None: async def close(self) -> None: """Close OpenViking and release resources.""" - if self._transaction_manager: - await self._transaction_manager.stop() - self._transaction_manager = None + if self._lock_manager: + await self._lock_manager.stop() + self._lock_manager = None if self._vikingdb_manager: self._vikingdb_manager.mark_closing() diff --git a/openviking/service/debug_service.py b/openviking/service/debug_service.py index 7dffff65..b99b4e73 100644 --- a/openviking/service/debug_service.py +++ b/openviking/service/debug_service.py @@ -9,14 +9,14 @@ from openviking.storage import VikingDBManager from openviking.storage.observers import ( + LockObserver, QueueObserver, RetrievalObserver, - TransactionObserver, VikingDBObserver, VLMObserver, ) from openviking.storage.queuefs import get_queue_manager -from openviking.storage.transaction import get_transaction_manager +from openviking.storage.transaction import get_lock_manager from openviking_cli.utils.config import OpenVikingConfig @@ -136,20 +136,20 @@ def vlm(self) -> ComponentStatus: ) @property - def 
transaction(self) -> ComponentStatus: - """Get transaction status.""" + def lock(self) -> ComponentStatus: + """Get lock system status.""" try: - transaction_manager = get_transaction_manager() + lock_manager = get_lock_manager() except Exception: return ComponentStatus( - name="transaction", + name="lock", is_healthy=False, has_errors=True, status="Not initialized", ) - observer = TransactionObserver(transaction_manager) + observer = LockObserver(lock_manager) return ComponentStatus( - name="transaction", + name="lock", is_healthy=observer.is_healthy(), has_errors=observer.has_errors(), status=observer.get_status_table(), @@ -173,7 +173,7 @@ def system(self) -> SystemStatus: "queue": self.queue, "vikingdb": self.vikingdb, "vlm": self.vlm, - "transaction": self.transaction, + "lock": self.lock, "retrieval": self.retrieval, } errors = [f"{c.name} has errors" for c in components.values() if c.has_errors] diff --git a/openviking/session/session.py b/openviking/session/session.py index 5726b8e3..c0f87bd9 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -226,10 +226,12 @@ def commit(self) -> Dict[str, Any]: async def commit_async(self) -> Dict[str, Any]: """Async commit session: two-phase approach. - Phase 1 (Archive, no transaction): Write archive, clear messages. - Phase 2 (Memory, transaction with redo semantics): Extract memories, write, enqueue. + Phase 1 (Archive): Write archive, clear messages. + Phase 2 (Memory, redo-log protected): Extract memories, write, enqueue. 
""" - from openviking.storage.transaction import TransactionContext, get_transaction_manager + import uuid + + from openviking.storage.transaction import get_lock_manager result = { "session_id": self.session_id, @@ -243,9 +245,7 @@ async def commit_async(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", 0) return result - tx_manager = get_transaction_manager() - - # ===== Preparation (no transaction) ===== + # ===== Preparation ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -253,7 +253,7 @@ async def commit_async(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - # ===== Phase 1: Archive (no transaction, no lock) ===== + # ===== Phase 1: Archive (no lock) ===== archive_uri = ( f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" ) @@ -273,54 +273,57 @@ async def commit_async(self) -> Dict[str, Any]: f"history/archive_{self._compression.compression_index:03d}/" ) - # ===== Phase 2: Memory extraction + write (transaction, redo semantics) ===== - async with TransactionContext( - tx_manager, - "session_memory", - [], - lock_mode="none", - ) as tx: - # Store redo info so _recover_one can redo from archive on crash - tx.record.init_info.update( - { - "archive_uri": archive_uri, - "session_uri": self._session_uri, - "account_id": self.ctx.account_id, - "user_id": self.ctx.user.user_id, - "agent_id": self.ctx.user.agent_id, - "role": self.ctx.role.value, - } - ) + # ===== Phase 2: Memory extraction + write (redo-log protected) ===== + redo_log = get_lock_manager().redo_log + task_id = str(uuid.uuid4()) + redo_log.write_pending( + task_id, + { + "archive_uri": archive_uri, + "session_uri": self._session_uri, + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + }, + ) - if self._session_compressor: - logger.info( - 
f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - await self._write_to_agfs_async(self._messages) - await self._write_relations_async() - tx.add_post_action( - "enqueue_semantic", - { - "uri": self._session_uri, - "context_type": "memory", - "account_id": self.ctx.account_id, - "user_id": self.ctx.user.user_id, - "agent_id": self.ctx.user.agent_id, - "role": self.ctx.role.value, - }, + if self._session_compressor: + logger.info( + f"Starting memory extraction from {len(messages_to_archive)} archived messages" + ) + memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, ) - await tx.commit() + logger.info(f"Extracted {len(memories)} memories") + result["memories_extracted"] = len(memories) + self._stats.memories_extracted += len(memories) + get_current_telemetry().set("memory.extracted", len(memories)) + + await self._write_to_agfs_async(self._messages) + await self._write_relations_async() + + # Enqueue semantic processing directly + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + + queue_manager = get_queue_manager() + if queue_manager: + msg = SemanticMsg( + uri=self._session_uri, + context_type="memory", + account_id=self.ctx.account_id, + user_id=self.ctx.user.user_id, + agent_id=self.ctx.user.agent_id, + role=self.ctx.role.value, + ) + semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) + await semantic_queue.enqueue(msg) + + 
redo_log.mark_done(task_id) # Update active_count active_count_updated = await self._update_active_counts_async() diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index 7f6a483b..010200e7 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -31,13 +31,9 @@ class SchemaError(StorageException): """Raised when schema validation fails.""" -class TransactionError(VikingDBException): - """Raised when a transaction operation fails.""" +class LockError(VikingDBException): + """Raised when a lock operation fails.""" -class LockAcquisitionError(TransactionError): +class LockAcquisitionError(LockError): """Raised when lock acquisition fails.""" - - -class TransactionRollbackError(TransactionError): - """Raised when transaction rollback fails.""" diff --git a/openviking/storage/observers/__init__.py b/openviking/storage/observers/__init__.py index 4a36700a..dae5aed3 100644 --- a/openviking/storage/observers/__init__.py +++ b/openviking/storage/observers/__init__.py @@ -1,17 +1,17 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 from .base_observer import BaseObserver +from .lock_observer import LockObserver from .queue_observer import QueueObserver from .retrieval_observer import RetrievalObserver -from .transaction_observer import TransactionObserver from .vikingdb_observer import VikingDBObserver from .vlm_observer import VLMObserver __all__ = [ "BaseObserver", + "LockObserver", "QueueObserver", "RetrievalObserver", - "TransactionObserver", "VikingDBObserver", "VLMObserver", ] diff --git a/openviking/storage/observers/lock_observer.py b/openviking/storage/observers/lock_observer.py new file mode 100644 index 00000000..92521790 --- /dev/null +++ b/openviking/storage/observers/lock_observer.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""LockObserver: Lock system observability.""" + +import time +from typing import Any, Dict, List + +from openviking.storage.observers.base_observer import BaseObserver +from openviking.storage.transaction.lock_manager import LockManager +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockObserver(BaseObserver): + """Observability tool for the lock system.""" + + def __init__(self, lock_manager: LockManager): + self._manager = lock_manager + + def get_active_locks(self) -> List[Dict[str, Any]]: + """Return info about every active lock handle.""" + now = time.time() + return [ + { + "id": h.id, + "lock_count": len(h.locks), + "created_at": h.created_at, + "duration_seconds": round(now - h.created_at, 1), + } + for h in self._manager.get_active_handles().values() + ] + + def get_hanging_locks(self, threshold: float = 600) -> List[Dict[str, Any]]: + """Return locks that have been held longer than *threshold* seconds.""" + now = time.time() + return [lock for lock in self.get_active_locks() if now - lock["created_at"] > threshold] + + # ------ BaseObserver interface ------ + + def get_status_table(self) -> str: + locks = self.get_active_locks() + if not locks: + return "No active locks." 
+ + from tabulate import tabulate + + data = [ + { + "Handle ID": l["id"][:8] + "...", + "Locks": l["lock_count"], + "Duration": f"{l['duration_seconds']}s", + "Created": time.strftime("%H:%M:%S", time.localtime(l["created_at"])), + } + for l in locks + ] + data.append( + { + "Handle ID": f"TOTAL ({len(locks)})", + "Locks": sum(l["lock_count"] for l in locks), + "Duration": "", + "Created": "", + } + ) + return tabulate(data, headers="keys", tablefmt="pretty") + + def is_healthy(self) -> bool: + return not self.get_hanging_locks(600) + + def has_errors(self) -> bool: + return bool(self.get_hanging_locks(600)) diff --git a/openviking/storage/observers/transaction_observer.py b/openviking/storage/observers/transaction_observer.py deleted file mode 100644 index e29b7665..00000000 --- a/openviking/storage/observers/transaction_observer.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -TransactionObserver: Transaction system observability tool. - -Provides methods to observe and report transaction manager status. -""" - -import time -from typing import Any, Dict - -from openviking.storage.observers.base_observer import BaseObserver -from openviking.storage.transaction import TransactionManager -from openviking.storage.transaction.transaction_record import TransactionStatus -from openviking_cli.utils import run_async -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -class TransactionObserver(BaseObserver): - """ - TransactionObserver: System observability tool for transaction management. - - Provides methods to query transaction status and format output. - """ - - def __init__(self, transaction_manager: TransactionManager): - """Initialize transaction observer. 
- - Args: - transaction_manager: Transaction manager instance to observe - """ - self._transaction_manager = transaction_manager - - async def get_status_table_async(self) -> str: - """Get transaction status table asynchronously. - - Returns: - Formatted table string showing transaction status - """ - if not self._transaction_manager: - return "Transaction manager not initialized." - - transactions = self._transaction_manager.get_active_transactions() - - if not transactions: - return "No active transactions." - - return self._format_status_as_table(transactions) - - def get_status_table(self) -> str: - """Get transaction status table synchronously. - - Returns: - Formatted table string showing transaction status - """ - return run_async(self.get_status_table_async()) - - def __str__(self) -> str: - """String representation returns status table. - - Returns: - Formatted table string - """ - return self.get_status_table() - - def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: - """Format transaction statuses as a table. 
- - Args: - transactions: Dict mapping transaction IDs to TransactionRecord - - Returns: - Formatted table string - """ - from tabulate import tabulate - - data = [] - - # Group transactions by status - status_counts = { - TransactionStatus.INIT: 0, - TransactionStatus.ACQUIRE: 0, - TransactionStatus.EXEC: 0, - TransactionStatus.COMMIT: 0, - TransactionStatus.FAIL: 0, - TransactionStatus.RELEASING: 0, - TransactionStatus.RELEASED: 0, - } - - for tx_id, tx in transactions.items(): - duration = time.time() - tx.created_at - duration_str = f"{duration:.1f}s" - - status_counts[tx.status] += 1 - - data.append( - { - "Transaction ID": tx_id[:8] + "...", - "Status": str(tx.status), - "Locks": len(tx.locks), - "Duration": duration_str, - "Created": time.strftime("%H:%M:%S", time.localtime(tx.created_at)), - } - ) - - status_priority = { - TransactionStatus.EXEC: 0, - TransactionStatus.ACQUIRE: 1, - TransactionStatus.RELEASING: 2, - TransactionStatus.INIT: 3, - TransactionStatus.COMMIT: 4, - TransactionStatus.FAIL: 5, - TransactionStatus.RELEASED: 6, - } - - data.sort(key=lambda x: status_priority.get(TransactionStatus(x["Status"]), 99)) - - total = len(transactions) - total_locks = sum(len(tx.locks) for tx in transactions.values()) - - summary_row = { - "Transaction ID": f"TOTAL ({total})", - "Status": "", - "Locks": total_locks, - "Duration": "", - "Created": "", - } - data.append(summary_row) - - return tabulate(data, headers="keys", tablefmt="pretty") - - def is_healthy(self) -> bool: - """Check if transaction system is healthy. - - Returns: - True if system is healthy, False otherwise - """ - return not self.has_errors() - - def has_errors(self) -> bool: - """Check if transaction system has any errors. 
- - Returns: - True if errors (failed transactions) exist, False otherwise - """ - if not self._transaction_manager: - return True - - transactions = self._transaction_manager.get_active_transactions() - - # Check for failed transactions - for tx_id, tx in transactions.items(): - if tx.status == TransactionStatus.FAIL: - logger.warning(f"Found failed transaction: {tx_id}") - return True - - return False - - def get_failed_transactions(self) -> Dict[str, Any]: - """Get all failed transactions. - - Returns: - Dict mapping transaction IDs to failed TransactionRecord - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - return { - tx_id: tx for tx_id, tx in transactions.items() if tx.status == TransactionStatus.FAIL - } - - def get_hanging_transactions(self, timeout_threshold: int = 300) -> Dict[str, Any]: - """Get transactions that have been running longer than threshold. - - Args: - timeout_threshold: Timeout threshold in seconds (default: 300 = 5 minutes) - - Returns: - Dict mapping transaction IDs to TransactionRecord that exceed threshold - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - current_time = time.time() - - return { - tx_id: tx - for tx_id, tx in transactions.items() - if current_time - tx.created_at > timeout_threshold - } - - def get_status_summary(self) -> Dict[str, int]: - """Get summary of transaction counts by status. 
- - Returns: - Dict mapping status strings to counts - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - - summary = { - "INIT": 0, - "ACQUIRE": 0, - "EXEC": 0, - "COMMIT": 0, - "FAIL": 0, - "RELEASING": 0, - "RELEASED": 0, - "TOTAL": 0, - } - - for tx in transactions.values(): - summary[str(tx.status)] += 1 - summary["TOTAL"] += 1 - - return summary diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 32ac0501..4ee10a93 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -506,7 +506,7 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: async def _overview_task(self, dir_uri: str) -> None: from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager node = self._nodes.get(dir_uri) if not node: @@ -538,18 +538,13 @@ async def _overview_task(self, dir_uri: str) -> None: dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) try: - # No undo entries recorded: semantic files (.overview.md / .abstract.md) are - # regenerable, so residual writes after a crash are acceptable. 
- async with TransactionContext( - get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" - ) as tx: + async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): await self._viking_fs.write_file( f"{dir_uri}/.overview.md", overview, ctx=self._ctx ) await self._viking_fs.write_file( f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx ) - await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 830dbd12..0db4019b 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -285,17 +285,13 @@ async def _process_single_directory( ) -> None: """Process single directory, generate .abstract.md and .overview.md.""" from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager viking_fs = get_viking_fs() dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) try: - # No undo entries recorded: semantic files (.overview.md / .abstract.md) are - # regenerable, so residual writes after a crash are acceptable. - async with TransactionContext( - get_transaction_manager(), "semantic", [dir_path], lock_mode="point" - ) as tx: + async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): # 1. 
Collect .abstract.md from subdirectories children_abstracts = await self._collect_children_abstracts(children_uris) @@ -335,8 +331,6 @@ async def _process_single_directory( for result in results: if isinstance(result, Exception): logger.error(f"Vectorization failed: {result}", exc_info=True) - - await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index afbc3e1e..0fca8816 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -3,34 +3,28 @@ """ Transaction module for OpenViking. -Provides transaction management and lock mechanisms for data operations. +Provides path-lock management and redo-log crash recovery. """ -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.journal import TransactionJournal -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_manager import ( - TransactionManager, - get_transaction_manager, - init_transaction_manager, - reset_transaction_manager, -) -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_handle import LockHandle, LockOwner +from openviking.storage.transaction.lock_manager import ( + LockManager, + get_lock_manager, + init_lock_manager, + reset_lock_manager, ) -from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog __all__ = [ + "LockContext", + "LockHandle", + "LockManager", + "LockOwner", "PathLock", - "TransactionContext", - "TransactionJournal", - "TransactionManager", - 
"TransactionRecord", - "TransactionStatus", - "UndoEntry", - "execute_rollback", - "get_transaction_manager", - "init_transaction_manager", - "reset_transaction_manager", + "RedoLog", + "get_lock_manager", + "init_lock_manager", + "reset_lock_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py deleted file mode 100644 index 09697e10..00000000 --- a/openviking/storage/transaction/context_manager.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction context manager for OpenViking. - -Provides an async context manager that wraps a set of operations in a -transaction with automatic rollback on failure. -""" - -from typing import Any, Dict, List, Optional - -from openviking.storage.errors import LockAcquisitionError, TransactionError -from openviking.storage.transaction.transaction_record import TransactionRecord -from openviking.storage.transaction.undo import UndoEntry -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -class TransactionContext: - """Async context manager for transactional operations. - - Usage:: - - async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - seq = tx.record_undo("fs_rm", {"uri": uri}) - # ... do work ... 
- tx.mark_completed(seq) - await tx.commit() - """ - - def __init__( - self, - tx_manager: Any, - operation: str, - lock_paths: List[str], - lock_mode: str = "point", - mv_dst_path: Optional[str] = None, - src_is_dir: bool = True, - ): - self._tx_manager = tx_manager - self._operation = operation - self._lock_paths = lock_paths - self._lock_mode = lock_mode - self._mv_dst_path = mv_dst_path - self._src_is_dir = src_is_dir - self._record: Optional[TransactionRecord] = None - self._committed = False - self._sequence = 0 - - @property - def record(self) -> TransactionRecord: - if self._record is None: - raise TransactionError("Transaction not started") - return self._record - - async def __aenter__(self) -> "TransactionContext": - self._record = self._tx_manager.create_transaction( - init_info={ - "operation": self._operation, - "lock_paths": self._lock_paths, - "lock_mode": self._lock_mode, - "mv_dst_path": self._mv_dst_path, - } - ) - tx_id = self._record.id - - # Write journal BEFORE acquiring locks so that crash recovery can - # find orphan locks via init_info even if the process dies between - # lock creation and journal update. 
- try: - self._tx_manager.journal.write(self._record.to_journal()) - except Exception as e: - logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") - - success = False - if self._lock_mode == "none": - # No lock acquisition — transition directly to EXEC status - tx = self._tx_manager.get_transaction(tx_id) - if tx: - from openviking.storage.transaction.transaction_record import TransactionStatus - - tx.update_status(TransactionStatus.EXEC) - success = True - elif self._lock_mode == "subtree": - for path in self._lock_paths: - success = await self._tx_manager.acquire_lock_subtree(tx_id, path) - if not success: - break - elif self._lock_mode == "mv": - if len(self._lock_paths) < 1 or not self._mv_dst_path: - raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") - success = await self._tx_manager.acquire_lock_mv( - tx_id, - self._lock_paths[0], - self._mv_dst_path, - src_is_dir=self._src_is_dir, - ) - else: - # "point" mode (default) - for path in self._lock_paths: - success = await self._tx_manager.acquire_lock_point(tx_id, path) - if not success: - break - - if not success: - await self._tx_manager.rollback(tx_id) - raise LockAcquisitionError( - f"Failed to acquire {self._lock_mode} lock for {self._lock_paths}" - ) - - # Update journal with actual lock paths now populated in the record. 
- try: - self._tx_manager.journal.update(self._record.to_journal()) - except Exception as e: - logger.warning(f"[Transaction] Failed to update journal for {tx_id}: {e}") - - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - if not self._committed: - try: - await self._tx_manager.rollback(self._record.id) - except Exception as e: - logger.error(f"Rollback failed during __aexit__: {e}") - return False - - def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: - seq = self._sequence - self._sequence += 1 - entry = UndoEntry(sequence=seq, op_type=op_type, params=params) - self.record.undo_log.append(entry) - - try: - self._tx_manager.journal.update(self.record.to_journal()) - except Exception as e: - logger.debug(f"[Transaction] Failed to persist journal: {e}") - - return seq - - def mark_completed(self, sequence: int) -> None: - for entry in self.record.undo_log: - if entry.sequence == sequence: - entry.completed = True - break - - try: - self._tx_manager.journal.update(self.record.to_journal()) - except Exception as e: - logger.debug(f"[Transaction] Failed to persist journal: {e}") - - def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: - self.record.post_actions.append({"type": action_type, "params": params}) - - async def commit(self) -> None: - success = await self._tx_manager.commit(self._record.id) - if not success: - raise TransactionError(f"Failed to commit transaction {self._record.id}") - self._committed = True diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py deleted file mode 100644 index 6cb14474..00000000 --- a/openviking/storage/transaction/journal.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction journal for crash recovery. 
- -Persists transaction state to AGFS so that incomplete transactions can be -detected and recovered after a process restart. -""" - -import json -from typing import Any, Dict, List - -from openviking.pyagfs import AGFSClient -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - -# Journal root path (global, not behind VikingFS URI mapping) -_JOURNAL_ROOT = "/local/_system/transactions" - - -class TransactionJournal: - """Persists transaction records to AGFS for crash recovery. - - Journal files live at ``/local/_system/transactions/{tx_id}/journal.json``. - """ - - def __init__(self, agfs: AGFSClient): - self._agfs = agfs - - def _tx_dir(self, tx_id: str) -> str: - return f"{_JOURNAL_ROOT}/{tx_id}" - - def _journal_path(self, tx_id: str) -> str: - return f"{_JOURNAL_ROOT}/{tx_id}/journal.json" - - def _ensure_dir(self, path: str) -> None: - """Create directory, ignoring already-exists errors.""" - try: - self._agfs.mkdir(path) - except Exception as e: - logger.warning(f"[Journal] mkdir {path}: {e}") - - def write(self, data: Dict[str, Any]) -> None: - """Create a new journal entry for a transaction. - - Args: - data: Serialized transaction record (from TransactionRecord.to_journal()). - """ - tx_id = data["id"] - self._ensure_dir("/local/_system") - self._ensure_dir(_JOURNAL_ROOT) - self._ensure_dir(self._tx_dir(tx_id)) - payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") - self._agfs.write(self._journal_path(tx_id), payload) - logger.info(f"[Journal] Written: {self._journal_path(tx_id)}") - - def update(self, data: Dict[str, Any]) -> None: - """Overwrite an existing journal entry. - - Args: - data: Updated serialized transaction record. - """ - tx_id = data["id"] - payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") - self._agfs.write(self._journal_path(tx_id), payload) - - def read(self, tx_id: str) -> Dict[str, Any]: - """Read a journal entry. - - Args: - tx_id: Transaction ID. 
- - Returns: - Parsed journal data. - - Raises: - FileNotFoundError: If journal does not exist. - """ - content = self._agfs.cat(self._journal_path(tx_id)) - if isinstance(content, bytes): - content = content.decode("utf-8") - return json.loads(content) - - def delete(self, tx_id: str) -> None: - """Delete a transaction's journal directory. - - Args: - tx_id: Transaction ID. - """ - try: - self._agfs.rm(self._tx_dir(tx_id), recursive=True) - logger.debug(f"[Journal] Deleted journal for tx {tx_id}") - except Exception as e: - logger.warning(f"[Journal] Failed to delete journal for tx {tx_id}: {e}") - - def list_all(self) -> List[str]: - """List all transaction IDs that have journal entries. - - Returns: - List of transaction ID strings. - """ - try: - entries = self._agfs.ls(_JOURNAL_ROOT) - tx_ids = [] - if isinstance(entries, list): - for entry in entries: - name = entry.get("name", "") if isinstance(entry, dict) else str(entry) - if name and name not in (".", "..") and entry.get("isDir", True): - tx_ids.append(name) - return tx_ids - except Exception: - return [] diff --git a/openviking/storage/transaction/lock_context.py b/openviking/storage/transaction/lock_context.py new file mode 100644 index 00000000..62fc15ba --- /dev/null +++ b/openviking/storage/transaction/lock_context.py @@ -0,0 +1,68 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockContext — async context manager for acquiring/releasing path locks.""" + +from typing import Optional + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.lock_manager import LockManager + + +class LockContext: + """``async with LockContext(manager, paths, mode) as handle: ...`` + + Acquires locks on entry, releases them on exit. No undo / journal / commit + semantics — just a lock scope. 
+ """ + + def __init__( + self, + lock_manager: LockManager, + paths: list[str], + lock_mode: str = "point", + mv_dst_path: Optional[str] = None, + src_is_dir: bool = True, + ): + self._manager = lock_manager + self._paths = paths + self._lock_mode = lock_mode + self._mv_dst_path = mv_dst_path + self._src_is_dir = src_is_dir + self._handle: Optional[LockHandle] = None + + async def __aenter__(self) -> LockHandle: + self._handle = self._manager.create_handle() + success = False + + if self._lock_mode == "subtree": + for path in self._paths: + success = await self._manager.acquire_subtree(self._handle, path) + if not success: + break + elif self._lock_mode == "mv": + if self._mv_dst_path is None: + raise LockAcquisitionError("mv lock mode requires mv_dst_path") + success = await self._manager.acquire_mv( + self._handle, + self._paths[0], + self._mv_dst_path, + src_is_dir=self._src_is_dir, + ) + else: # "point" + for path in self._paths: + success = await self._manager.acquire_point(self._handle, path) + if not success: + break + + if not success: + await self._manager.release(self._handle) + raise LockAcquisitionError( + f"Failed to acquire {self._lock_mode} lock for {self._paths}" + ) + return self._handle + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self._handle: + await self._manager.release(self._handle) + return False diff --git a/openviking/storage/transaction/lock_handle.py b/openviking/storage/transaction/lock_handle.py new file mode 100644 index 00000000..7b5be5d9 --- /dev/null +++ b/openviking/storage/transaction/lock_handle.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Lock handle and LockOwner protocol for PathLock integration.""" + +import time +import uuid +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class LockOwner(Protocol): + """Minimal interface that PathLock requires from its caller.""" + + id: str + locks: list[str] + + def add_lock(self, path: str) -> None: ... + def remove_lock(self, path: str) -> None: ... + + +@dataclass +class LockHandle: + """Identifies a lock holder. PathLock uses ``id`` to generate fencing tokens + and ``locks`` to track acquired lock files.""" + + id: str = field(default_factory=lambda: str(uuid.uuid4())) + locks: list[str] = field(default_factory=list) + created_at: float = field(default_factory=time.time) + + def add_lock(self, lock_path: str) -> None: + if lock_path not in self.locks: + self.locks.append(lock_path) + + def remove_lock(self, lock_path: str) -> None: + if lock_path in self.locks: + self.locks.remove(lock_path) diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py new file mode 100644 index 00000000..5e2e5076 --- /dev/null +++ b/openviking/storage/transaction/lock_manager.py @@ -0,0 +1,247 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockManager — global singleton managing lock lifecycle and redo recovery.""" + +import asyncio +import json +import time +from typing import Any, Dict, Optional + +from openviking.pyagfs import AGFSClient +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockManager: + """Global singleton. 
Manages lock lifecycle and stale cleanup.""" + + def __init__( + self, + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, + ): + self._agfs = agfs + self._path_lock = PathLock(agfs, lock_expire=lock_expire) + self._lock_timeout = lock_timeout + self._redo_log = RedoLog(agfs) + self._handles: Dict[str, LockHandle] = {} + self._cleanup_task: Optional[asyncio.Task] = None + self._running = False + + @property + def redo_log(self) -> RedoLog: + return self._redo_log + + def get_active_handles(self) -> Dict[str, LockHandle]: + return dict(self._handles) + + async def start(self) -> None: + """Start background cleanup and redo recovery.""" + self._running = True + self._cleanup_task = asyncio.create_task(self._stale_cleanup_loop()) + await self._recover_pending_redo() + + async def stop(self) -> None: + """Stop cleanup and release all active locks.""" + self._running = False + if self._cleanup_task: + self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass + for handle in list(self._handles.values()): + await self._path_lock.release(handle) + self._handles.clear() + + def create_handle(self) -> LockHandle: + handle = LockHandle() + self._handles[handle.id] = handle + return handle + + async def acquire_point( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_point( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_subtree( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_subtree( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_mv( + self, + handle: LockHandle, + src: str, + dst: str, + src_is_dir: bool = True, + timeout: Optional[float] = None, + ) -> bool: + return await self._path_lock.acquire_mv( + src, + dst, + handle, + timeout=timeout if 
timeout is not None else self._lock_timeout, + src_is_dir=src_is_dir, + ) + + async def release(self, handle: LockHandle) -> None: + await self._path_lock.release(handle) + self._handles.pop(handle.id, None) + + async def _stale_cleanup_loop(self) -> None: + """Check and release leaked handles every 60 s (in-process safety net).""" + while self._running: + await asyncio.sleep(60) + now = time.time() + stale = [h for h in self._handles.values() if now - h.created_at > 3600] + for handle in stale: + logger.warning(f"Releasing stale lock handle {handle.id}") + await self.release(handle) + + # ------------------------------------------------------------------ + # Redo recovery (session_memory only) + # ------------------------------------------------------------------ + + async def _recover_pending_redo(self) -> None: + pending_ids = self._redo_log.list_pending() + for task_id in pending_ids: + logger.info(f"Recovering pending redo task: {task_id}") + try: + info = self._redo_log.read(task_id) + if info: + await self._redo_session_memory(info) + self._redo_log.mark_done(task_id) + except Exception as e: + logger.error(f"Redo recovery failed for {task_id}: {e}", exc_info=True) + + async def _redo_session_memory(self, info: Dict[str, Any]) -> None: + """Re-extract memories from archive.""" + from openviking.message import Message + from openviking.server.identity import RequestContext, Role + from openviking.session.compressor import SessionCompressor + from openviking_cli.session.user_id import UserIdentifier + + archive_uri = info.get("archive_uri") + session_uri = info.get("session_uri") + account_id = info.get("account_id", "default") + user_id = info.get("user_id", "default") + agent_id = info.get("agent_id", "default") + role_str = info.get("role", "root") + + if not archive_uri or not session_uri: + logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") + return + + # 1. 
Read archived messages + messages_path = f"{archive_uri}/messages.jsonl" + try: + agfs_path = messages_path.replace("viking://", "") + content = self._agfs.cat(agfs_path) + if isinstance(content, bytes): + content = content.decode("utf-8") + except Exception as e: + logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") + return + + messages = [] + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass + + if not messages: + logger.warning(f"No messages found in archive for redo: {archive_uri}") + return + + # 2. Build request context + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str)) + + # 3. Re-extract memories (best-effort: skip if compressor not available) + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + try: + compressor = SessionCompressor(vikingdb=None) + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + except Exception as e: + logger.warning(f"Redo: memory extraction skipped ({e}), will retry via queue") + + # 4. 
Enqueue semantic processing + await self._enqueue_semantic( + uri=session_uri, + context_type="memory", + account_id=account_id, + user_id=user_id, + agent_id=agent_id, + role=role_str, + ) + + async def _enqueue_semantic(self, **params: Any) -> None: + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + from openviking.storage.queuefs.semantic_queue import SemanticQueue + + queue_manager = get_queue_manager() + if queue_manager is None: + logger.debug("No queue manager available, skipping enqueue_semantic") + return + + uri = params.get("uri") + if not uri: + return + + msg = SemanticMsg( + uri=uri, + context_type=params.get("context_type", "resource"), + account_id=params.get("account_id", "default"), + user_id=params.get("user_id", "default"), + agent_id=params.get("agent_id", "default"), + role=params.get("role", "root"), + ) + semantic_queue: SemanticQueue = queue_manager.get_queue(queue_manager.SEMANTIC) # type: ignore[assignment] + await semantic_queue.enqueue(msg) + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_lock_manager: Optional[LockManager] = None + + +def init_lock_manager( + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, +) -> LockManager: + global _lock_manager + _lock_manager = LockManager(agfs=agfs, lock_timeout=lock_timeout, lock_expire=lock_expire) + return _lock_manager + + +def get_lock_manager() -> LockManager: + if _lock_manager is None: + raise RuntimeError("LockManager not initialized. 
Call init_lock_manager() first.") + return _lock_manager + + +def reset_lock_manager() -> None: + global _lock_manager + _lock_manager = None diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 5de99743..d9212b3b 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -3,7 +3,7 @@ from typing import Optional, Tuple from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.transaction_record import TransactionRecord +from openviking.storage.transaction.lock_handle import LockOwner from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) @@ -19,8 +19,8 @@ _POLL_INTERVAL = 0.2 -def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: - return f"{tx_id}:{time.time_ns()}:{lock_type}" +def _make_fencing_token(owner_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: + return f"{owner_id}:{time.time_ns()}:{lock_type}" def _parse_fencing_token(token: str) -> Tuple[str, int, str]: @@ -29,20 +29,20 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: rest = token[:-2] idx = rest.rfind(":") if idx >= 0: - tx_id_part = rest[:idx] + owner_id_part = rest[:idx] ts_part = rest[idx + 1 :] try: - return tx_id_part, int(ts_part), lock_type + return owner_id_part, int(ts_part), lock_type except ValueError: pass return rest, 0, lock_type if ":" in token: idx = token.rfind(":") - tx_id_part = token[:idx] + owner_id_part = token[:idx] ts_part = token[idx + 1 :] try: - return tx_id_part, int(ts_part), LOCK_TYPE_POINT + return owner_id_part, int(ts_part), LOCK_TYPE_POINT except ValueError: pass @@ -76,25 +76,25 @@ def _read_token(self, lock_path: str) -> Optional[str]: except Exception: return None - async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: + async def _is_locked_by_other(self, lock_path: str, owner_id: str) -> bool: token = self._read_token(lock_path) if token is 
None: return False lock_owner, _, _ = _parse_fencing_token(token) - return lock_owner != transaction_id + return lock_owner != owner_id async def _create_lock_file( - self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT + self, lock_path: str, owner_id: str, lock_type: str = LOCK_TYPE_POINT ) -> None: - token = _make_fencing_token(transaction_id, lock_type) + token = _make_fencing_token(owner_id, lock_type) self._agfs.write(lock_path, token.encode("utf-8")) - async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: + async def _verify_lock_ownership(self, lock_path: str, owner_id: str) -> bool: token = self._read_token(lock_path) if token is None: return False lock_owner, _, _ = _parse_fencing_token(token) - return lock_owner == transaction_id + return lock_owner == owner_id async def _remove_lock_file(self, lock_path: str) -> bool: try: @@ -115,19 +115,19 @@ def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: age = (time.time_ns() - ts) / 1e9 return age > expire_seconds - async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: + async def _check_ancestors_for_subtree(self, path: str, exclude_owner_id: str) -> Optional[str]: parent = self._get_parent_path(path) while parent: lock_path = self._get_lock_path(parent) token = self._read_token(lock_path) if token is not None: owner_id, _, lock_type = _parse_fencing_token(token) - if owner_id != exclude_tx_id and lock_type == LOCK_TYPE_SUBTREE: + if owner_id != exclude_owner_id and lock_type == LOCK_TYPE_SUBTREE: return lock_path parent = self._get_parent_path(parent) return None - async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: + async def _scan_descendants_for_locks(self, path: str, exclude_owner_id: str) -> Optional[str]: try: entries = self._agfs.ls(path) if not isinstance(entries, list): @@ -145,19 +145,17 @@ async def _scan_descendants_for_locks(self, 
path: str, exclude_tx_id: str) -> Op token = self._read_token(subdir_lock) if token is not None: owner_id, _, _ = _parse_fencing_token(token) - if owner_id != exclude_tx_id: + if owner_id != exclude_owner_id: return subdir_lock - result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) + result = await self._scan_descendants_for_locks(subdir, exclude_owner_id) if result: return result except Exception as e: logger.warning(f"Failed to scan descendants of {path}: {e}") return None - async def acquire_point( - self, path: str, transaction: TransactionRecord, timeout: float = 0.0 - ) -> bool: - transaction_id = transaction.id + async def acquire_point(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: + owner_id = owner.id lock_path = self._get_lock_path(path) deadline = asyncio.get_running_loop().time() + timeout @@ -168,7 +166,7 @@ async def acquire_point( return False while True: - if await self._is_locked_by_other(lock_path, transaction_id): + if await self._is_locked_by_other(lock_path, owner_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) @@ -179,7 +177,7 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): logger.warning( @@ -196,22 +194,22 @@ async def acquire_point( continue try: - await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_POINT) except Exception as e: logger.error(f"[POINT] Failed to create lock file: {e}") return False backed_off = False - conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + conflict_after = await 
self._check_ancestors_for_subtree(path, owner_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: - their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) my_token = self._read_token(lock_path) _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) ) - if (my_ts, transaction_id) > (their_ts, their_tx_id): + if (my_ts, owner_id) > (their_ts, their_owner_id): logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True @@ -222,21 +220,19 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - if not await self._verify_lock_ownership(lock_path, transaction_id): + if not await self._verify_lock_ownership(lock_path, owner_id): logger.debug(f"[POINT] Lock ownership verification failed: {path}") if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue - transaction.add_lock(lock_path) + owner.add_lock(lock_path) logger.debug(f"[POINT] Lock acquired: {lock_path}") return True - async def acquire_subtree( - self, path: str, transaction: TransactionRecord, timeout: float = 0.0 - ) -> bool: - transaction_id = transaction.id + async def acquire_subtree(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: + owner_id = owner.id lock_path = self._get_lock_path(path) deadline = asyncio.get_running_loop().time() + timeout @@ -247,7 +243,7 @@ async def acquire_subtree( return False while True: - if await self._is_locked_by_other(lock_path, transaction_id): + if await self._is_locked_by_other(lock_path, owner_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) @@ -258,8 +254,8 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Check ancestor paths for SUBTREE locks held 
by other transactions - ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + # Check ancestor paths for SUBTREE locks held by other owners + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): logger.warning( @@ -275,7 +271,7 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) + desc_conflict = await self._scan_descendants_for_locks(path, owner_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") @@ -290,24 +286,24 @@ async def acquire_subtree( continue try: - await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_SUBTREE) except Exception as e: logger.error(f"[SUBTREE] Failed to create lock file: {e}") return False backed_off = False - conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + conflict_after = await self._scan_descendants_for_locks(path, owner_id) if not conflict_after: - conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + conflict_after = await self._check_ancestors_for_subtree(path, owner_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: - their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) my_token = self._read_token(lock_path) _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) ) - if (my_ts, transaction_id) > (their_ts, their_tx_id): + if (my_ts, owner_id) > (their_ts, their_owner_id): logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True @@ -318,14 +314,14 @@ 
async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - if not await self._verify_lock_ownership(lock_path, transaction_id): + if not await self._verify_lock_ownership(lock_path, owner_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue - transaction.add_lock(lock_path) + owner.add_lock(lock_path) logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True @@ -333,35 +329,35 @@ async def acquire_mv( self, src_path: str, dst_path: str, - transaction: TransactionRecord, + owner: LockOwner, timeout: float = 0.0, src_is_dir: bool = True, ) -> bool: if src_is_dir: - if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + if not await self.acquire_subtree(src_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - if not await self.acquire_subtree(dst_path, transaction, timeout=timeout): + if not await self.acquire_subtree(dst_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") - await self.release(transaction) + await self.release(owner) return False else: src_parent = src_path.rsplit("/", 1)[0] if "/" in src_path else src_path - if not await self.acquire_point(src_parent, transaction, timeout=timeout): + if not await self.acquire_point(src_parent, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") return False - if not await self.acquire_point(dst_path, transaction, timeout=timeout): + if not await self.acquire_point(dst_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") - await self.release(transaction) + await self.release(owner) return False logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True - async def release(self, 
transaction: TransactionRecord) -> None: - lock_count = len(transaction.locks) - for lock_path in reversed(transaction.locks): + async def release(self, owner: LockOwner) -> None: + lock_count = len(owner.locks) + for lock_path in reversed(owner.locks): await self._remove_lock_file(lock_path) - transaction.remove_lock(lock_path) + owner.remove_lock(lock_path) - logger.debug(f"Released {lock_count} locks for transaction {transaction.id}") + logger.debug(f"Released {lock_count} locks for owner {owner.id}") diff --git a/openviking/storage/transaction/redo_log.py b/openviking/storage/transaction/redo_log.py new file mode 100644 index 00000000..80d07dff --- /dev/null +++ b/openviking/storage/transaction/redo_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Lightweight redo log for crash recovery of session_memory operations.""" + +import json +from typing import Any, Dict, List + +from openviking.pyagfs import AGFSClient +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + +_REDO_ROOT = "/local/_system/redo" + + +class RedoLog: + """Lightweight pending-task marker. + + Write a marker before the operation starts; delete it after success. + On startup, scan for leftover markers and redo. 
+ """ + + def __init__(self, agfs: AGFSClient): + self._agfs = agfs + + def _task_path(self, task_id: str) -> str: + return f"{_REDO_ROOT}/{task_id}/redo.json" + + def _ensure_dirs(self, dir_path: str) -> None: + parts = dir_path.strip("/").split("/") + current = "" + for part in parts: + current = f"{current}/{part}" + try: + self._agfs.mkdir(current) + except Exception: + pass + + def write_pending(self, task_id: str, info: Dict[str, Any]) -> None: + """Write a redo marker before the operation starts.""" + dir_path = f"{_REDO_ROOT}/{task_id}" + self._ensure_dirs(dir_path) + data = json.dumps(info, default=str).encode("utf-8") + self._agfs.write(self._task_path(task_id), data) + + def mark_done(self, task_id: str) -> None: + """Delete the redo marker after a successful operation.""" + try: + self._agfs.rm(f"{_REDO_ROOT}/{task_id}", recursive=True) + except Exception as e: + logger.warning(f"Failed to clean redo marker {task_id}: {e}") + + def list_pending(self) -> List[str]: + """Return all pending task IDs (directories under _REDO_ROOT).""" + try: + entries = self._agfs.ls(_REDO_ROOT) + if not isinstance(entries, list): + return [] + return [ + e["name"] + for e in entries + if isinstance(e, dict) and e.get("isDir") and e.get("name") not in (".", "..") + ] + except Exception: + return [] + + def read(self, task_id: str) -> Dict[str, Any]: + """Read the info dict of a pending task.""" + try: + content = self._agfs.cat(self._task_path(task_id)) + if isinstance(content, bytes): + content = content.decode("utf-8") + return json.loads(content) + except Exception as e: + logger.warning(f"Failed to read redo info for {task_id}: {e}") + return {} diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py deleted file mode 100644 index e80df09b..00000000 --- a/openviking/storage/transaction/transaction_manager.py +++ /dev/null @@ -1,739 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -""" -Transaction manager for OpenViking. - -Global singleton that manages transaction lifecycle and lock mechanisms. -""" - -import asyncio -import threading -import time -from typing import Any, Dict, List, Optional - -from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, -) -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - -# Global singleton instance -_transaction_manager: Optional["TransactionManager"] = None -_lock = threading.Lock() - - -class TransactionManager: - """Transaction manager for OpenViking. - - Global singleton that manages transaction lifecycle and lock mechanisms. - Responsible for: - - Allocating transaction IDs - - Managing transaction lifecycle (start, commit, rollback) - - Providing transaction lock mechanism interface, preventing deadlocks - - Persisting transaction state to journal for crash recovery - """ - - def __init__( - self, - agfs_client: AGFSClient, - timeout: int = 3600, - max_parallel_locks: int = 8, - lock_timeout: float = 0.0, - lock_expire: float = 300.0, - vector_store: Optional[Any] = None, - ): - """Initialize transaction manager. - - Args: - agfs_client: AGFS client for file system operations - timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - lock_timeout: Path lock acquisition timeout in seconds. - 0 (default) = fail immediately if locked. - > 0 = wait/retry up to this many seconds. - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - vector_store: Optional vector store for VectorDB rollback operations. 
- """ - from openviking.storage.transaction.journal import TransactionJournal - - self._agfs = agfs_client - self._timeout = timeout - self._max_parallel_locks = max_parallel_locks - self._lock_timeout = lock_timeout - self._vector_store = vector_store - self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) - self._journal = TransactionJournal(agfs_client) - - # Active transactions: {transaction_id: TransactionRecord} - self._transactions: Dict[str, TransactionRecord] = {} - - # Background task for timeout cleanup - self._cleanup_task: Optional[asyncio.Task] = None - self._running = False - - logger.info( - f"TransactionManager initialized (timeout={timeout}s, max_parallel_locks={max_parallel_locks})" - ) - - @property - def journal(self): - return self._journal - - async def start(self) -> None: - """Start transaction manager. - - Starts the background cleanup task and recovers any pending transactions - left from a previous process crash. - """ - if self._running: - logger.debug("TransactionManager already running") - return - - self._running = True - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - - # Recover any transactions that were interrupted by a previous crash. - # Journal entries are written BEFORE lock acquisition, so every orphan - # lock has a corresponding journal entry that recovery can use to clean it up. - await self._recover_pending_transactions() - - logger.info("TransactionManager started") - - async def stop(self) -> None: - """Stop transaction manager. - - Stops the background cleanup task and releases all resources. 
- """ - if not self._running: - logger.debug("TransactionManager already stopped") - return - - self._running = False - - # Cancel cleanup task - if self._cleanup_task: - self._cleanup_task.cancel() - try: - await self._cleanup_task - except asyncio.CancelledError: - pass - self._cleanup_task = None - - # Release all active transactions' locks - for tx_id in list(self._transactions.keys()): - tx = self._transactions.pop(tx_id, None) - if tx: - await self._path_lock.release(tx) - - logger.info("TransactionManager stopped") - - async def _cleanup_loop(self) -> None: - """Background loop for cleaning up timed-out transactions.""" - while self._running: - try: - await asyncio.sleep(60) # Check every minute - await self._cleanup_timed_out() - except asyncio.CancelledError: - break - except Exception as e: - logger.error(f"Error in cleanup loop: {e}") - - async def _cleanup_timed_out(self) -> None: - """Clean up timed-out transactions.""" - current_time = time.time() - timed_out = [] - - for tx_id, tx in self._transactions.items(): - if current_time - tx.updated_at > self._timeout: - timed_out.append(tx_id) - - for tx_id in timed_out: - logger.warning(f"Transaction timed out: {tx_id}") - await self.rollback(tx_id) - - async def _recover_pending_transactions(self) -> None: - """Recover pending transactions from journal after a crash. - - Reads all journal entries and rolls back any transactions that were - not cleanly committed or rolled back. - """ - try: - pending_ids = self._journal.list_all() - except Exception as e: - logger.warning(f"Failed to list journal entries for recovery: {e}") - return - - if not pending_ids: - return - - logger.info(f"Found {len(pending_ids)} pending transaction(s) to recover") - - for tx_id in pending_ids: - try: - await self._recover_one(tx_id) - except Exception as e: - logger.error(f"Failed to recover transaction {tx_id}: {e}") - - async def _recover_one(self, tx_id: str) -> None: - """Recover a single transaction from journal. 
- - Recovery strategy by status: - COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up - COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up - INIT / ACQUIRE → nothing executed yet, just clean up - """ - from openviking.storage.transaction.undo import execute_rollback - - try: - data = self._journal.read(tx_id) - except Exception as e: - logger.warning(f"Cannot read journal for tx {tx_id}: {e}") - return - - tx = TransactionRecord.from_journal(data) - logger.info(f"Recovering transaction {tx_id} (status={tx.status})") - - if tx.status == TransactionStatus.COMMIT: - # Transaction was committed — replay any unfinished post_actions - if tx.post_actions: - logger.info( - f"Replaying {len(tx.post_actions)} post_action(s) for committed tx {tx_id}" - ) - try: - await self._execute_post_actions(tx.post_actions) - except Exception as e: - logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") - elif tx.status in (TransactionStatus.INIT, TransactionStatus.ACQUIRE): - # Transaction never executed any operations — nothing to rollback. - # However, locks may have been created before the journal was updated - # with the actual locks list. Use init_info.lock_paths to find and - # clean up orphan lock files owned by this transaction. - logger.info(f"Transaction {tx_id} never executed, cleaning up orphan locks") - if not tx.locks: - await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) - else: - # EXEC / FAIL / RELEASING: process crashed mid-operation - operation = tx.init_info.get("operation", "") - if operation == "session_memory": - # Redo: re-extract memories from archive and write - try: - await self._redo_session_memory(tx) - except Exception as e: - logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") - else: - # Default: rollback completed+partial ops - # Pass recover_all=True so partial (completed=False) ops are also reversed, - # e.g. 
a directory mv that started but never finished still leaves residue. - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - recover_all=True, - ) - except Exception as e: - logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") - - # Release any lock files still present - await self._path_lock.release(tx) - - # Clean up journal - try: - self._journal.delete(tx_id) - except Exception: - pass - - logger.info(f"Recovered transaction {tx_id}") - - async def _cleanup_orphan_locks_from_init_info( - self, tx_id: str, init_info: Dict[str, Any] - ) -> None: - """Clean up orphan lock files using lock path hints from init_info. - - When a crash occurs between lock creation and journal update, the - journal's ``locks`` list is empty but ``init_info.lock_paths`` records - the paths that were intended to be locked. This method checks those - paths and removes any lock files still owned by this transaction. - """ - from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _parse_fencing_token - - lock_paths = init_info.get("lock_paths", []) - lock_mode = init_info.get("lock_mode", "point") - mv_dst_path = init_info.get("mv_dst_path") - - # Collect all candidate paths to check - paths_to_check = list(lock_paths) - if lock_mode == "mv" and mv_dst_path: - paths_to_check.append(mv_dst_path) - - for path in paths_to_check: - lock_file = f"{path.rstrip('/')}/{LOCK_FILE_NAME}" - try: - token = self._path_lock._read_token(lock_file) - if token is None: - continue - owner_id, _, _ = _parse_fencing_token(token) - if owner_id == tx_id: - await self._path_lock._remove_lock_file(lock_file) - logger.info(f"Removed orphan lock for tx {tx_id}: {lock_file}") - except Exception as e: - logger.warning(f"Failed to check orphan lock {lock_file}: {e}") - - async def _redo_session_memory(self, tx: TransactionRecord) -> None: - """Redo a session_memory transaction from its archived messages. 
- - On crash during Phase 2 of session commit, we redo memory extraction - from the archive rather than rolling back. - """ - import json - - from openviking.message import Message - from openviking.server.identity import RequestContext, Role - from openviking_cli.session.user_id import UserIdentifier - - archive_uri = tx.init_info.get("archive_uri") - session_uri = tx.init_info.get("session_uri") - account_id = tx.init_info.get("account_id", "default") - user_id = tx.init_info.get("user_id", "default") - agent_id = tx.init_info.get("agent_id", "default") - role_str = tx.init_info.get("role", "root") - - if not archive_uri or not session_uri: - logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") - return - - # 1. Read archived messages from AGFS - messages_path = f"{archive_uri}/messages.jsonl" - try: - agfs_path = messages_path.replace("viking://", "") - content = self._agfs.cat(agfs_path) - if isinstance(content, bytes): - content = content.decode("utf-8") - except Exception as e: - logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") - return - - messages = [] - for line in content.strip().split("\n"): - if line.strip(): - try: - messages.append(Message.from_dict(json.loads(line))) - except Exception: - pass - - if not messages: - logger.warning(f"No messages found in archive for redo: {archive_uri}") - return - - # 2. Build request context for memory extraction - user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) - ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) - - # 3. Re-extract memories - from openviking.session.compressor import SessionCompressor - - compressor = SessionCompressor() - session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] - memories = await compressor.extract_long_term_memories( - messages=messages, - user=user, - session_id=session_id, - ctx=ctx, - ) - logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") - - # 4. 
Enqueue semantic processing - await self._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": session_uri, - "context_type": "memory", - "account_id": account_id, - "user_id": user_id, - "agent_id": agent_id, - "role": role_str, - }, - } - ] - ) - - def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: - """Create a new transaction. - - Args: - init_info: Transaction initialization information - - Returns: - New transaction record - """ - tx = TransactionRecord(init_info=init_info or {}) - self._transactions[tx.id] = tx - logger.debug(f"Transaction created: {tx.id}") - return tx - - def get_transaction(self, transaction_id: str) -> Optional[TransactionRecord]: - """Get transaction by ID. - - Args: - transaction_id: Transaction ID - - Returns: - Transaction record or None if not found - """ - return self._transactions.get(transaction_id) - - async def begin(self, transaction_id: str) -> bool: - """Begin a transaction. - - Args: - transaction_id: Transaction ID - - Returns: - True if transaction started successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - logger.debug(f"Transaction begun: {transaction_id}") - return True - - async def commit(self, transaction_id: str) -> bool: - """Commit a transaction. - - Executes post-actions, releases all locks, and removes the journal entry. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction committed successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to COMMIT - tx.update_status(TransactionStatus.COMMIT) - - # Persist final committed state before releasing - try: - self._journal.update(tx.to_journal()) - except Exception: - pass - - # Execute post-actions (best-effort, errors are logged but don't fail commit) - if tx.post_actions: - await self._execute_post_actions(tx.post_actions) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - # Clean up journal entry (last step — lock is already released) - try: - self._journal.delete(transaction_id) - except Exception as e: - logger.warning(f"Failed to delete journal on commit for {transaction_id}: {e}") - - logger.debug(f"Transaction committed: {transaction_id}") - return True - - async def rollback(self, transaction_id: str) -> bool: - """Rollback a transaction. - - Executes undo log entries in reverse order, releases all locks, - and removes the journal entry. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction rolled back successfully, False otherwise - """ - from openviking.storage.transaction.undo import execute_rollback - - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to FAIL - tx.update_status(TransactionStatus.FAIL) - - # Persist rollback state - try: - self._journal.update(tx.to_journal()) - except Exception: - pass - - # Execute undo log (best-effort) - if tx.undo_log: - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - ) - except Exception as e: - logger.warning( - f"Undo log execution failed during rollback of {transaction_id}: {e}" - ) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - # Clean up journal entry (last step — lock is already released) - try: - self._journal.delete(transaction_id) - except Exception as e: - logger.warning(f"Failed to delete journal on rollback for {transaction_id}: {e}") - - logger.debug(f"Transaction rolled back: {transaction_id}") - return True - - async def _execute_post_actions(self, post_actions: List[Dict[str, Any]]) -> None: - """Execute post-commit actions. - - Post-actions are executed after a successful commit. Errors are logged - but do not affect the commit outcome. 
- - Args: - post_actions: List of post-action dicts with 'type' and 'params' keys - """ - for action in post_actions: - action_type = action.get("type", "") - params = action.get("params", {}) - try: - if action_type == "enqueue_semantic": - await self._post_enqueue_semantic(params) - else: - logger.warning(f"Unknown post-action type: {action_type}") - except Exception as e: - logger.warning(f"Post-action '{action_type}' failed: {e}") - - async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: - """Execute enqueue_semantic post-action.""" - from openviking.storage.queuefs import get_queue_manager - from openviking.storage.queuefs.semantic_msg import SemanticMsg - - queue_manager = get_queue_manager() - if queue_manager is None: - logger.debug("No queue manager available, skipping enqueue_semantic post-action") - return - - uri = params.get("uri") - context_type = params.get("context_type", "resource") - account_id = params.get("account_id", "default") - user_id = params.get("user_id", "default") - agent_id = params.get("agent_id", "default") - role = params.get("role", "root") - if not uri: - return - - msg = SemanticMsg( - uri=uri, - context_type=context_type, - account_id=account_id, - user_id=user_id, - agent_id=agent_id, - role=role, - ) - semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) - await semantic_queue.enqueue(msg) - - async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: - """Acquire POINT lock for write/semantic-processing operations. 
- - Args: - transaction_id: Transaction ID - path: Directory path to lock - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_subtree( - self, transaction_id: str, path: str, timeout: Optional[float] = None - ) -> bool: - """Acquire SUBTREE lock for rm/mv-source operations. - - Args: - transaction_id: Transaction ID - path: Directory path to lock (root of the subtree) - timeout: Maximum time to wait for the lock in seconds (default: from config) - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - effective_timeout = timeout if timeout is not None else self._lock_timeout - success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_mv( - self, - transaction_id: str, - src_path: str, - dst_path: str, - timeout: Optional[float] = None, - src_is_dir: bool = True, - ) -> bool: - """Acquire path lock for mv operation. 
- - Args: - transaction_id: Transaction ID - src_path: Source path - dst_path: Destination parent directory path - timeout: Maximum time to wait for each lock in seconds (default: from config) - src_is_dir: Whether the source is a directory - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - effective_timeout = timeout if timeout is not None else self._lock_timeout - success = await self._path_lock.acquire_mv( - src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir - ) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - def get_active_transactions(self) -> Dict[str, TransactionRecord]: - """Get all active transactions. - - Returns: - Dictionary of active transactions {transaction_id: TransactionRecord} - """ - return self._transactions.copy() - - def get_transaction_count(self) -> int: - """Get the number of active transactions. - - Returns: - Number of active transactions - """ - return len(self._transactions) - - -def init_transaction_manager( - agfs: AGFSClient, - tx_timeout: int = 3600, - max_parallel_locks: int = 8, - lock_timeout: float = 0.0, - lock_expire: float = 300.0, - vector_store: Optional[Any] = None, -) -> TransactionManager: - """Initialize transaction manager singleton. - - Args: - agfs: AGFS client instance - tx_timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - lock_timeout: Path lock acquisition timeout in seconds. - 0 (default) = fail immediately if locked. - > 0 = wait/retry up to this many seconds. - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - vector_store: Optional vector store for VectorDB rollback operations. 
- - Returns: - TransactionManager instance - """ - global _transaction_manager - - with _lock: - if _transaction_manager is not None: - logger.debug("TransactionManager already initialized") - return _transaction_manager - - # Create transaction manager - _transaction_manager = TransactionManager( - agfs_client=agfs, - timeout=tx_timeout, - max_parallel_locks=max_parallel_locks, - lock_timeout=lock_timeout, - lock_expire=lock_expire, - vector_store=vector_store, - ) - - logger.info("TransactionManager initialized as singleton") - return _transaction_manager - - -def get_transaction_manager() -> TransactionManager: - """Get transaction manager singleton.""" - if _transaction_manager is None: - raise RuntimeError( - "TransactionManager not initialized. Call init_transaction_manager() first." - ) - return _transaction_manager - - -def reset_transaction_manager() -> None: - """Reset the transaction manager singleton (for testing). - - This function should ONLY be used in tests to clean up state between tests. - It clears the global singleton instance without performing cleanup - make sure - to call stop() first if the manager is still running. - """ - global _transaction_manager - with _lock: - _transaction_manager = None diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py deleted file mode 100644 index b9eb0656..00000000 --- a/openviking/storage/transaction/transaction_record.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction record and status definitions. - -Defines the data structures for tracking transaction lifecycle and state. -""" - -import time -import uuid -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, List - - -class TransactionStatus(str, Enum): - """Transaction status enumeration. 
- - Status machine: INIT -> ACQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED - """ - - INIT = "INIT" # Transaction initialized, waiting for lock acquisition - ACQUIRE = "ACQUIRE" # Acquiring lock resources - EXEC = "EXEC" # Transaction operation in progress - COMMIT = "COMMIT" # Transaction completed successfully - FAIL = "FAIL" # Transaction failed - RELEASING = "RELEASING" # Releasing lock resources - RELEASED = "RELEASED" # Lock resources fully released, transaction ended - - def __str__(self) -> str: - return self.value - - -@dataclass -class TransactionRecord: - """Transaction record for tracking transaction lifecycle. - - Attributes: - id: Transaction ID in UUID format, uniquely identifies a transaction - locks: List of lock paths held by this transaction - status: Current transaction status - init_info: Transaction initialization information - rollback_info: Information for rollback operations - undo_log: List of undo entries for rollback - post_actions: Actions to execute after successful commit - created_at: Creation timestamp (Unix timestamp in seconds) - updated_at: Last update timestamp (Unix timestamp in seconds) - """ - - id: str = field(default_factory=lambda: str(uuid.uuid4())) - locks: List[str] = field(default_factory=list) - status: TransactionStatus = field(default=TransactionStatus.INIT) - init_info: Dict[str, Any] = field(default_factory=dict) - rollback_info: Dict[str, Any] = field(default_factory=dict) - undo_log: List[Any] = field(default_factory=list) - post_actions: List[Dict[str, Any]] = field(default_factory=list) - created_at: float = field(default_factory=time.time) - updated_at: float = field(default_factory=time.time) - - def update_status(self, status: TransactionStatus) -> None: - """Update transaction status and timestamp.""" - self.status = status - self.updated_at = time.time() - - def add_lock(self, lock_path: str) -> None: - """Add a lock to the transaction.""" - if lock_path not in self.locks: - 
self.locks.append(lock_path) - self.updated_at = time.time() - - def remove_lock(self, lock_path: str) -> None: - """Remove a lock from the transaction.""" - if lock_path in self.locks: - self.locks.remove(lock_path) - self.updated_at = time.time() - - def to_dict(self) -> Dict[str, Any]: - """Convert transaction record to dictionary.""" - return { - "id": self.id, - "locks": self.locks, - "status": str(self.status), - "init_info": self.init_info, - "rollback_info": self.rollback_info, - "created_at": self.created_at, - "updated_at": self.updated_at, - } - - def to_journal(self) -> Dict[str, Any]: - """Serialize to journal format (includes undo_log and post_actions).""" - from openviking.storage.transaction.undo import UndoEntry - - return { - "id": self.id, - "locks": self.locks, - "status": str(self.status), - "init_info": self.init_info, - "undo_log": [e.to_dict() if isinstance(e, UndoEntry) else e for e in self.undo_log], - "post_actions": self.post_actions, - "created_at": self.created_at, - "updated_at": self.updated_at, - } - - @classmethod - def from_journal(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Restore from journal format.""" - from openviking.storage.transaction.undo import UndoEntry - - status_str = data.get("status", "INIT") - status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str - undo_log = [UndoEntry.from_dict(e) for e in data.get("undo_log", [])] - - return cls( - id=data.get("id", str(uuid.uuid4())), - locks=data.get("locks", []), - status=status, - init_info=data.get("init_info", {}), - rollback_info={}, - undo_log=undo_log, - post_actions=data.get("post_actions", []), - created_at=data.get("created_at", time.time()), - updated_at=data.get("updated_at", time.time()), - ) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Create transaction record from dictionary.""" - status_str = data.get("status", "INIT") - status = TransactionStatus(status_str) if 
isinstance(status_str, str) else status_str - - return cls( - id=data.get("id", str(uuid.uuid4())), - locks=data.get("locks", []), - status=status, - init_info=data.get("init_info", {}), - rollback_info=data.get("rollback_info", {}), - created_at=data.get("created_at", time.time()), - updated_at=data.get("updated_at", time.time()), - ) diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py deleted file mode 100644 index 0b5b3113..00000000 --- a/openviking/storage/transaction/undo.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Undo log and rollback executor for transaction management. - -Records operations performed within a transaction so they can be reversed -on rollback. Each UndoEntry captures one atomic sub-operation. -""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: - """Reconstruct a RequestContext from serialized _ctx_* fields in undo params. - - Returns None if the required fields are missing. 
- """ - account_id = params.get("_ctx_account_id") - user_id = params.get("_ctx_user_id") - agent_id = params.get("_ctx_agent_id") - role_value = params.get("_ctx_role") - if account_id is None or user_id is None: - return None - try: - from openviking.server.identity import RequestContext, Role - from openviking_cli.session.user_id import UserIdentifier - - role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT - user = UserIdentifier(account_id, user_id, agent_id or "default") - return RequestContext(user=user, role=role) - except Exception as e: - logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") - return None - - -@dataclass -class UndoEntry: - """A single undo log entry representing one reversible sub-operation. - - Attributes: - sequence: Monotonically increasing index within the transaction. - op_type: Operation type (fs_mv, fs_rm, fs_mkdir, fs_write_new, - vectordb_upsert, vectordb_delete, vectordb_update_uri). - params: Parameters needed to reverse the operation. - completed: Whether the forward operation completed successfully. - """ - - sequence: int - op_type: str - params: Dict[str, Any] = field(default_factory=dict) - completed: bool = False - - def to_dict(self) -> Dict[str, Any]: - return { - "sequence": self.sequence, - "op_type": self.op_type, - "params": self.params, - "completed": self.completed, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": - return cls( - sequence=data.get("sequence", 0), - op_type=data.get("op_type", ""), - params=data.get("params", {}), - completed=data.get("completed", False), - ) - - -async def execute_rollback( - undo_log: List[UndoEntry], - agfs: Any, - vector_store: Optional[Any] = None, - ctx: Optional[Any] = None, - recover_all: bool = False, -) -> None: - """Execute rollback by reversing operations in reverse order. - - Best-effort: each step is wrapped in try-except so a single failure - does not prevent subsequent undo steps from running. 
- - Args: - undo_log: List of undo entries to process. - agfs: AGFS client for filesystem operations. - vector_store: Optional vector store client. - ctx: Optional request context. - recover_all: If True, also attempt to reverse entries that were not - marked completed (used during crash recovery to clean up partial - operations such as a directory mv that only half-finished). - """ - if recover_all: - entries = list(undo_log) - else: - entries = [e for e in undo_log if e.completed] - entries.sort(key=lambda e: e.sequence, reverse=True) - - for entry in entries: - try: - await _rollback_entry(entry, agfs, vector_store, ctx) - logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") - except Exception as e: - logger.warning( - f"[Rollback] Failed to reverse {entry.op_type} seq={entry.sequence}: {e}" - ) - - -async def _rollback_entry( - entry: UndoEntry, - agfs: Any, - vector_store: Optional[Any], - ctx: Optional[Any], -) -> None: - """Dispatch rollback for a single undo entry.""" - op = entry.op_type - params = entry.params - - if op == "fs_mv": - agfs.mv(params["dst"], params["src"]) - - elif op == "fs_rm": - logger.debug("[Rollback] fs_rm is not reversible, skipping") - - elif op == "fs_mkdir": - try: - agfs.rm(params["uri"]) - except Exception: - pass - - elif op == "fs_write_new": - try: - agfs.rm(params["uri"], recursive=True) - except Exception: - pass - - elif op == "vectordb_upsert": - if vector_store: - record_id = params.get("record_id") - if record_id: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx: - await vector_store.delete([record_id], ctx=restored_ctx) - else: - logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") - - elif op == "vectordb_delete": - if vector_store: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx is None: - logger.warning("[Rollback] vectordb_delete: cannot reconstruct ctx, skipping") - else: - records_snapshot = params.get("records_snapshot", []) - for record in 
records_snapshot: - try: - await vector_store.upsert(record, ctx=restored_ctx) - except Exception as e: - logger.warning(f"[Rollback] Failed to restore vector record: {e}") - - elif op == "vectordb_update_uri": - if vector_store: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx is None: - logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") - else: - await vector_store.update_uri_mapping( - ctx=restored_ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), - ) - - else: - logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index dc20acd1..72475573 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -289,17 +289,14 @@ async def rm( This method is idempotent: deleting a non-existent file succeeds after cleaning up any orphan index records. - Wrapped in a transaction: deletes VectorDB records first, then FS files. - On rollback, VectorDB records are restored from snapshot. + Acquires a path lock, deletes VectorDB records, then FS files. 
""" - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) - tx_manager = get_transaction_manager() - # Check existence and determine lock strategy try: stat = self.agfs.stat(path) @@ -320,36 +317,11 @@ async def rm( lock_paths = [parent] lock_mode = "point" - async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: - # Collect URIs inside the lock to avoid race conditions + async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) uris_to_delete.append(target_uri) - - # Snapshot vector records for rollback - records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) - - # Step 1: Delete from VectorDB first - real_ctx = self._ctx_or_default(ctx) - seq_vdb = tx.record_undo( - "vectordb_delete", - { - "uris": uris_to_delete, - "records_snapshot": records_snapshot, - "_ctx_account_id": real_ctx.account_id, - "_ctx_user_id": real_ctx.user.user_id, - "_ctx_agent_id": real_ctx.user.agent_id, - "_ctx_role": real_ctx.role.value, - }, - ) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - tx.mark_completed(seq_vdb) - - # Step 2: Delete from FS - seq_fs = tx.record_undo("fs_rm", {"uri": path, "recursive": recursive}) result = self.agfs.rm(path, recursive=recursive) - tx.mark_completed(seq_fs) - - await tx.commit() return result async def mv( @@ -361,10 +333,10 @@ async def mv( """Move file/directory + recursively update vector index. Implemented as cp + rm to avoid lock files being carried by FS mv. - On rollback, the copy is deleted and the source remains intact. + On VectorDB update failure the copy is cleaned up so the source stays intact. 
""" from openviking.pyagfs.helpers import cp as agfs_cp - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(old_uri, ctx) self._ensure_access(new_uri, ctx) @@ -372,8 +344,6 @@ async def mv( new_path = self._uri_to_path(new_uri, ctx=ctx) target_uri = self._path_to_uri(old_path, ctx=ctx) - tx_manager = get_transaction_manager() - # Verify source exists and determine type before locking try: stat = self.agfs.stat(old_path) @@ -383,20 +353,17 @@ async def mv( dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path - async with TransactionContext( - tx_manager, - "mv", + async with LockContext( + get_lock_manager(), [old_path], lock_mode="mv", mv_dst_path=dst_parent, src_is_dir=is_dir, - ) as tx: - # Collect URIs inside the lock to avoid race conditions + ): uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) uris_to_move.append(target_uri) - # Step 1: Copy source to destination - seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) + # Copy source to destination (source still intact) try: agfs_cp(self.agfs, old_path, new_path, recursive=is_dir) except Exception as e: @@ -404,9 +371,8 @@ async def mv( await self._delete_from_vector_store(uris_to_move, ctx=ctx) logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - tx.mark_completed(seq_cp) - # Step 2: Remove carried lock file from the copy (directory only) + # Remove carried lock file from the copy (directory only) if is_dir: carried_lock = new_path.rstrip("/") + "/.path.ovlock" try: @@ -414,34 +380,18 @@ async def mv( except Exception: pass - # Step 3: Update VectorDB URIs - old_uri_stripped = old_uri.rstrip("/") - old_parent_uri = ( - old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" - ) - real_ctx = self._ctx_or_default(ctx) - seq_vdb = tx.record_undo( - "vectordb_update_uri", - { - 
"old_uri": old_uri, - "new_uri": new_uri, - "old_parent_uri": old_parent_uri, - "uris": uris_to_move, - "_ctx_account_id": real_ctx.account_id, - "_ctx_user_id": real_ctx.user.user_id, - "_ctx_agent_id": real_ctx.user.agent_id, - "_ctx_role": real_ctx.role.value, - }, - ) - await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) - tx.mark_completed(seq_vdb) + # Update VectorDB URIs (on failure, clean up the copy) + try: + await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + except Exception: + try: + self.agfs.rm(new_path, recursive=is_dir) + except Exception: + pass + raise - # Step 4: Remove source (lock file gets deleted along with it) - seq_rm = tx.record_undo("fs_rm", {"uri": old_path, "recursive": is_dir}) + # Delete source self.agfs.rm(old_path, recursive=is_dir) - tx.mark_completed(seq_rm) - - await tx.commit() return {} async def grep( @@ -1131,7 +1081,7 @@ def _is_accessible(self, uri: str, ctx: RequestContext) -> bool: return True scope = parts[0] - if scope in {"resources", "temp", "transactions"}: + if scope in {"resources", "temp"}: return True if scope == "_system": return False @@ -1206,33 +1156,6 @@ def _infer_context_type(self, uri: str): # ========== Vector Sync Helper Methods ========== - async def _snapshot_vector_records( - self, uris: List[str], ctx: Optional[RequestContext] = None - ) -> List[Dict[str, Any]]: - """Snapshot vector records for the given URIs (for rollback). - - Queries VectorDB metadata (without embedding vectors) so that - records can be restored during rollback. 
- """ - vector_store = self._get_vector_store() - if not vector_store: - return [] - - real_ctx = self._ctx_or_default(ctx) - snapshots = [] - for uri in uris: - try: - records = await vector_store.get_context_by_uri( - uri=uri, - limit=10, - ctx=real_ctx, - ) - if records: - snapshots.extend(records) - except Exception as e: - logger.debug(f"[VikingFS] Failed to snapshot vector record for {uri}: {e}") - return snapshots - async def _collect_uris( self, path: str, recursive: bool, ctx: Optional[RequestContext] = None ) -> List[str]: diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index d6c44194..42ca8752 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -218,11 +218,8 @@ async def process_resource( viking_fs = get_viking_fs() target_exists = await viking_fs.exists(root_uri, ctx=ctx) if not target_exists: - # 第一次添加:事务保护下将 temp 移到 final - from openviking.storage.transaction import ( - TransactionContext, - get_transaction_manager, - ) + # 第一次添加:锁保护下将 temp 移到 final + from openviking.storage.transaction import LockContext, get_lock_manager dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) parent_path = dst_path.rsplit("/", 1)[0] if "/" in dst_path else dst_path @@ -232,17 +229,9 @@ async def process_resource( if parent_uri: await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) - async with TransactionContext( - get_transaction_manager(), - "finalize_from_temp", - [parent_path], - lock_mode="point", - ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": dst_path}) + async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) - tx.mark_completed(seq) - await tx.commit() # 清理 temp 根目录 try: diff --git a/openviking_cli/utils/config/transaction_config.py b/openviking_cli/utils/config/transaction_config.py index fac8c2aa..86d153f8 100644 --- 
a/openviking_cli/utils/config/transaction_config.py +++ b/openviking_cli/utils/config/transaction_config.py @@ -29,9 +29,4 @@ class TransactionConfig(BaseModel): ), ) - max_parallel_locks: int = Field( - default=8, - description="Maximum parallel lock operations during recursive rm/mv.", - ) - model_config = {"extra": "forbid"} diff --git a/tests/agfs/test_fs_binding.py b/tests/agfs/test_fs_binding.py index e55ff6fd..3e76ee8f 100644 --- a/tests/agfs/test_fs_binding.py +++ b/tests/agfs/test_fs_binding.py @@ -13,7 +13,7 @@ import pytest -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -33,15 +33,15 @@ async def viking_fs_binding_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs - reset_transaction_manager() + reset_lock_manager() @pytest.mark.asyncio diff --git a/tests/agfs/test_fs_binding_s3.py b/tests/agfs/test_fs_binding_s3.py index aa7a753b..802d4f6d 100644 --- a/tests/agfs/test_fs_binding_s3.py +++ b/tests/agfs/test_fs_binding_s3.py @@ -13,7 +13,7 @@ import pytest -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -58,13 +58,13 @@ async def viking_fs_binding_s3_instance(): # Create AGFS client agfs_client = 
create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs - reset_transaction_manager() + reset_lock_manager() @pytest.mark.asyncio diff --git a/tests/agfs/test_fs_local.py b/tests/agfs/test_fs_local.py index 9e59f610..41ef0730 100644 --- a/tests/agfs/test_fs_local.py +++ b/tests/agfs/test_fs_local.py @@ -10,7 +10,7 @@ import pytest from openviking.agfs_manager import AGFSManager -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -40,15 +40,15 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs - reset_transaction_manager() + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index 67a54e40..00504fad 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -13,7 +13,7 @@ import pytest from openviking.agfs_manager import AGFSManager -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ 
-83,13 +83,13 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs - reset_transaction_manager() + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 78dbb63e..98cf606f 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -20,7 +20,7 @@ from openviking.server.config import ServerConfig from openviking.server.identity import RequestContext, Role from openviking.service.core import OpenVikingService -from openviking.storage.transaction import reset_transaction_manager +from openviking.storage.transaction import reset_lock_manager from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.embedding_config import EmbeddingConfig from openviking_cli.utils.config.vlm_config import VLMConfig @@ -110,7 +110,7 @@ def sample_markdown_file(temp_dir: Path) -> Path: @pytest_asyncio.fixture(scope="function") async def service(temp_dir: Path, monkeypatch): """Create and initialize an OpenVikingService in embedded mode.""" - reset_transaction_manager() + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) _install_fake_vlm(monkeypatch) svc = OpenVikingService( @@ -120,7 +120,7 @@ async def service(temp_dir: Path, monkeypatch): svc.viking_fs.query_embedder = fake_embedder_cls() yield svc await svc.close() - reset_transaction_manager() + reset_lock_manager() @pytest_asyncio.fixture(scope="function") @@ -165,7 +165,7 @@ async def client_with_resource(client, service, sample_markdown_file): async def running_server(temp_dir: Path, monkeypatch): """Start a real uvicorn server in a background thread.""" await AsyncOpenViking.reset() - 
reset_transaction_manager() + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) _install_fake_vlm(monkeypatch) diff --git a/tests/storage/test_semantic_dag_skip_files.py b/tests/storage/test_semantic_dag_skip_files.py index 6fdf30ee..3eaeaa2f 100644 --- a/tests/storage/test_semantic_dag_skip_files.py +++ b/tests/storage/test_semantic_dag_skip_files.py @@ -11,19 +11,18 @@ def _mock_transaction_layer(monkeypatch): - """Patch transaction layer to no-op for DAG tests.""" - mock_tx = MagicMock() - mock_tx.commit = AsyncMock() + """Patch lock layer to no-op for DAG tests.""" + mock_handle = MagicMock() monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", - AsyncMock(return_value=mock_tx), + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), ) monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + "openviking.storage.transaction.lock_context.LockContext.__aexit__", AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.get_transaction_manager", + "openviking.storage.transaction.get_lock_manager", lambda: MagicMock(), ) @@ -58,6 +57,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return "abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 23dde041..94f9441f 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -40,6 +40,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return 
"abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): @@ -79,19 +82,18 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): lambda: _DummyTracker(), ) - # Mock transaction layer: TransactionContext as no-op passthrough - mock_tx = MagicMock() - mock_tx.commit = AsyncMock() + # Mock lock layer: LockContext as no-op passthrough + mock_handle = MagicMock() monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", - AsyncMock(return_value=mock_tx), + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), ) monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + "openviking.storage.transaction.lock_context.LockContext.__aexit__", AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.get_transaction_manager", + "openviking.storage.transaction.get_lock_manager", lambda: MagicMock(), ) diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py index 05fac402..a0952289 100644 --- a/tests/transaction/conftest.py +++ b/tests/transaction/conftest.py @@ -11,9 +11,9 @@ from openviking.agfs_manager import AGFSManager from openviking.server.identity import RequestContext, Role from openviking.storage.collection_schemas import CollectionSchemas -from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.lock_manager import LockManager from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _make_fencing_token -from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.redo_log import RedoLog from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend from openviking.utils.agfs_utils 
import create_agfs_client from openviking_cli.session.user_id import UserIdentifier @@ -55,7 +55,6 @@ def _mkdir_ok(agfs_client, path): @pytest.fixture def test_dir(agfs_client): - """每个测试独享隔离目录,自动清理。""" path = f"/local/tx-tests/{uuid.uuid4().hex}" _mkdir_ok(agfs_client, "/local") _mkdir_ok(agfs_client, "/local/tx-tests") @@ -102,20 +101,20 @@ def request_ctx(): # --------------------------------------------------------------------------- -# Transaction fixtures +# Lock fixtures # --------------------------------------------------------------------------- @pytest.fixture -def tx_manager(agfs_client, vector_store): - """Function-scoped TransactionManager with real backends.""" - return TransactionManager(agfs_client=agfs_client, vector_store=vector_store) +def lock_manager(agfs_client): + """Function-scoped LockManager with real AGFS backend.""" + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) @pytest.fixture -def journal(agfs_client): - """Function-scoped TransactionJournal with real AGFS backend.""" - return TransactionJournal(agfs_client) +def redo_log(agfs_client): + """Function-scoped RedoLog with real AGFS backend.""" + return RedoLog(agfs_client) # --------------------------------------------------------------------------- diff --git a/tests/transaction/test_concurrent_lock.py b/tests/transaction/test_concurrent_lock.py index e98279e4..7e25ab57 100644 --- a/tests/transaction/test_concurrent_lock.py +++ b/tests/transaction/test_concurrent_lock.py @@ -5,8 +5,8 @@ import asyncio import uuid +from openviking.storage.transaction.lock_handle import LockHandle from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_record import TransactionRecord class TestConcurrentLock: @@ -17,7 +17,7 @@ async def test_point_mutual_exclusion_same_path(self, agfs_client, test_dir): results = {} async def holder(tx_id): - tx = TransactionRecord(id=tx_id) + tx = LockHandle(id=tx_id) ok = await 
lock.acquire_point(test_dir, tx, timeout=5.0) if ok: await asyncio.sleep(0.3) @@ -45,7 +45,7 @@ async def test_subtree_blocks_concurrent_point_child(self, agfs_client, test_dir child_result = {} async def parent_holder(): - tx = TransactionRecord(id="tx-sub-parent") + tx = LockHandle(id="tx-sub-parent") ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) assert ok is True parent_acquired.set() @@ -55,7 +55,7 @@ async def parent_holder(): async def child_worker(): await parent_acquired.wait() - tx = TransactionRecord(id="tx-sub-child") + tx = LockHandle(id="tx-sub-child") ok = await lock.acquire_point(child, tx, timeout=5.0) child_result["ok"] = ok child_result["after_release"] = parent_released.is_set() @@ -80,7 +80,7 @@ async def test_point_child_blocks_concurrent_subtree_parent(self, agfs_client, t parent_result = {} async def child_holder(): - tx = TransactionRecord(id="tx-rev-child") + tx = LockHandle(id="tx-rev-child") ok = await lock.acquire_point(child, tx, timeout=5.0) assert ok is True child_acquired.set() @@ -90,7 +90,7 @@ async def child_holder(): async def parent_worker(): await child_acquired.wait() - tx = TransactionRecord(id="tx-rev-parent") + tx = LockHandle(id="tx-rev-parent") ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) parent_result["ok"] = ok parent_result["after_release"] = child_released.is_set() diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py deleted file mode 100644 index bf077bf9..00000000 --- a/tests/transaction/test_context_manager.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for TransactionContext.""" - -from unittest.mock import AsyncMock, MagicMock - -import pytest - -from openviking.storage.errors import LockAcquisitionError -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus - - -def _make_tx_manager(lock_succeeds=True): - """Create a mock TransactionManager with async methods.""" - tx_manager = MagicMock() - record = TransactionRecord(id="tx-test", status=TransactionStatus.INIT) - - tx_manager.create_transaction.return_value = record - tx_manager.acquire_lock_point = AsyncMock(return_value=lock_succeeds) - tx_manager.acquire_lock_subtree = AsyncMock(return_value=lock_succeeds) - tx_manager.acquire_lock_mv = AsyncMock(return_value=lock_succeeds) - tx_manager.commit = AsyncMock(return_value=True) - tx_manager.rollback = AsyncMock(return_value=True) - - journal = MagicMock() - tx_manager.journal = journal - - return tx_manager, record - - -class TestTransactionContextNormal: - async def test_commit_success(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) - tx.mark_completed(seq) - await tx.commit() - - tx_manager.commit.assert_called_once_with("tx-test") - tx_manager.rollback.assert_not_called() - - async def test_rollback_on_exception(self): - tx_manager, record = _make_tx_manager() - - with pytest.raises(ValueError): - async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) - tx.mark_completed(seq) - raise ValueError("something went wrong") - - tx_manager.rollback.assert_called_once_with("tx-test") - tx_manager.commit.assert_not_called() - - async def test_rollback_on_no_commit(self): - tx_manager, record = _make_tx_manager() - - async with 
TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - tx.record_undo("fs_write_new", {"uri": "/path/file"}) - # Forgot to call tx.commit() - - tx_manager.rollback.assert_called_once_with("tx-test") - - async def test_lock_failure_raises(self): - tx_manager, record = _make_tx_manager(lock_succeeds=False) - - with pytest.raises(LockAcquisitionError): - async with TransactionContext(tx_manager, "test_op", ["/path"]) as _tx: - pass - - -class TestTransactionContextLockModes: - async def test_subtree_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "rm_op", ["/path"], lock_mode="subtree") as tx: - await tx.commit() - - tx_manager.acquire_lock_subtree.assert_called_once() - - async def test_mv_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext( - tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path="/dst" - ) as tx: - await tx.commit() - - tx_manager.acquire_lock_mv.assert_called_once_with( - "tx-test", "/src", "/dst", src_is_dir=True - ) - - async def test_point_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "write_op", ["/path"], lock_mode="point") as tx: - await tx.commit() - - tx_manager.acquire_lock_point.assert_called_once() - - -class TestTransactionContextUndoLog: - async def test_undo_entries_tracked(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) - s1 = tx.record_undo("fs_write_new", {"uri": "/a/f.txt"}) - tx.mark_completed(s0) - tx.mark_completed(s1) - await tx.commit() - - assert len(record.undo_log) == 2 - assert record.undo_log[0].completed is True - assert record.undo_log[1].completed is True - - -class TestTransactionContextPostActions: - async def test_post_actions_added(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, 
"test", ["/path"]) as tx: - tx.add_post_action("enqueue_semantic", {"uri": "viking://test"}) - await tx.commit() - - assert len(record.post_actions) == 1 - assert record.post_actions[0]["type"] == "enqueue_semantic" - - -class TestTransactionContextEdgeCases: - async def test_commit_failure_raises_transaction_error(self): - """When TransactionManager.commit() returns False, TransactionError is raised.""" - from openviking.storage.errors import TransactionError - - tx_manager, record = _make_tx_manager() - tx_manager.commit = AsyncMock(return_value=False) - - with pytest.raises(TransactionError, match="Failed to commit"): - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - await tx.commit() - - async def test_mv_mode_missing_dst_raises(self): - """mv lock mode without mv_dst_path raises TransactionError.""" - from openviking.storage.errors import TransactionError - - tx_manager, record = _make_tx_manager() - - with pytest.raises(TransactionError, match="mv lock mode requires"): - async with TransactionContext( - tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path=None - ) as _tx: - pass - - async def test_mark_completed_nonexistent_sequence_is_noop(self): - """mark_completed with a sequence not in undo_log doesn't crash.""" - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) - tx.mark_completed(999) # Nonexistent sequence - # Original entry should remain unmarked - assert record.undo_log[0].completed is False - tx.mark_completed(seq) - assert record.undo_log[0].completed is True - await tx.commit() - - async def test_journal_update_failure_does_not_break_transaction(self): - """Journal update failures during record_undo/mark_completed are silently ignored.""" - tx_manager, record = _make_tx_manager() - tx_manager.journal.update.side_effect = Exception("disk full") - - # Should not raise despite journal failures - async with 
TransactionContext(tx_manager, "test", ["/path"]) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) - tx.mark_completed(seq) - await tx.commit() - - assert len(record.undo_log) == 1 - assert record.undo_log[0].completed is True - - async def test_record_property_before_enter_raises(self): - """Accessing tx.record before __aenter__ raises TransactionError.""" - from openviking.storage.errors import TransactionError - - tx_manager, _ = _make_tx_manager() - ctx = TransactionContext(tx_manager, "test", ["/path"]) - - with pytest.raises(TransactionError, match="Transaction not started"): - _ = ctx.record - - async def test_multiple_undo_entries_sequence_increments(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) - s1 = tx.record_undo("fs_write_new", {"uri": "/a/f"}) - s2 = tx.record_undo("fs_mv", {"src": "/a", "dst": "/b"}) - assert s0 == 0 - assert s1 == 1 - assert s2 == 2 - await tx.commit() - - async def test_multiple_lock_paths_point_mode(self): - """Multiple lock_paths in point mode: each path gets acquire_lock_point called.""" - tx_manager, record = _make_tx_manager() - - async with TransactionContext( - tx_manager, "multi", ["/path1", "/path2"], lock_mode="point" - ) as tx: - await tx.commit() - - assert tx_manager.acquire_lock_point.call_count == 2 - - async def test_subtree_multiple_paths_stops_on_first_failure(self): - """If acquiring subtree lock on first path fails, second path is not attempted.""" - tx_manager, record = _make_tx_manager(lock_succeeds=False) - - with pytest.raises(LockAcquisitionError): - async with TransactionContext( - tx_manager, "rm", ["/path1", "/path2"], lock_mode="subtree" - ) as _tx: - pass - - # Only called once (failed on first path) - assert tx_manager.acquire_lock_subtree.call_count == 1 diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py deleted file mode 
100644 index 21569edd..00000000 --- a/tests/transaction/test_crash_recovery.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -"""Integration test: crash recovery from journal using real AGFS and VectorDB backends.""" - -import uuid -from unittest.mock import AsyncMock, patch - -from openviking.storage.transaction.journal import TransactionJournal -from openviking.storage.transaction.transaction_manager import TransactionManager -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, -) -from openviking.storage.transaction.undo import UndoEntry - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists, make_lock_file - - -def _write_journal(journal, record): - """Write a TransactionRecord to real journal storage.""" - journal.write(record.to_journal()) - - -class TestCrashRecovery: - """ - Core technique: simulate crash recovery. - - 1. Create real FS state via agfs_client - 2. Build TransactionRecord, write to real journal - 3. Create fresh TransactionManager (simulates process restart) - 4. Call manager._recover_pending_transactions() - 5. 
Verify final state via agfs_client.stat()/cat() and vector_store.get() - """ - - async def test_recover_commit_no_rollback(self, agfs_client, vector_store, test_dir): - """COMMIT status → committed files NOT rolled back, journal cleaned up.""" - # Create a file that was part of a committed transaction - committed_file = f"{test_dir}/committed.txt" - agfs_client.write(committed_file, b"committed data") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-commit-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_write_new", - params={"uri": committed_file}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - # New manager (simulates restart) - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # File should still exist (no rollback for committed tx) - assert file_exists(agfs_client, committed_file) - # Journal should be cleaned up - assert tx_id not in journal.list_all() - - async def test_recover_commit_replays_post_actions(self, agfs_client, vector_store, test_dir): - """COMMIT + post_actions → replay post_actions.""" - journal = TransactionJournal(agfs_client) - tx_id = f"tx-post-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[], - post_actions=[ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test-post", - "context_type": "resource", - "account_id": "acc", - }, - } - ], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - - with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: - await manager._recover_pending_transactions() - - mock_post.assert_called_once() - assert tx_id not in journal.list_all() - - async def 
test_recover_exec_rollback_fs_mv(self, agfs_client, vector_store, test_dir): - """EXEC status with fs_mv → recovery rolls back → file moved back.""" - src = f"{test_dir}/exec-mv-src" - dst = f"{test_dir}/exec-mv-dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/data.txt", b"mv-data") - - # Simulate: forward mv happened, then crash - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-mv-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert file_exists(agfs_client, src) - assert not file_exists(agfs_client, dst) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_fs_mkdir(self, agfs_client, vector_store, test_dir): - """EXEC with fs_mkdir → recovery → directory removed.""" - new_dir = f"{test_dir}/exec-mkdir" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-mkdir-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_fs_write_new(self, agfs_client, vector_store, test_dir): - """EXEC with fs_write_new → recovery → file removed.""" - 
file_path = f"{test_dir}/exec-write.txt" - agfs_client.write(file_path, b"to-be-rolled-back") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-write-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_write_new", - params={"uri": file_path}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, file_path) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_vectordb_upsert( - self, agfs_client, vector_store, request_ctx, test_dir - ): - """EXEC with vectordb_upsert → recovery → record deleted from VectorDB.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/crash-upsert-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.5] * VECTOR_DIM, - "name": "crash-upsert", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-vdb-{uuid.uuid4().hex[:8]}" - tx_record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, tx_record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - results = await vector_store.get([record_id], 
ctx=request_ctx) - assert len(results) == 0 - assert tx_id not in journal.list_all() - - async def test_recover_fail_triggers_rollback(self, agfs_client, vector_store, test_dir): - """FAIL status → also triggers rollback.""" - new_dir = f"{test_dir}/fail-dir" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-fail-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.FAIL, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_releasing_triggers_rollback(self, agfs_client, vector_store, test_dir): - """RELEASING status → rollback + lock cleanup.""" - new_dir = f"{test_dir}/releasing-dir" - _mkdir_ok(agfs_client, new_dir) - - lock_path = make_lock_file(agfs_client, test_dir, "tx-releasing-placeholder", "S") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-releasing-{uuid.uuid4().hex[:8]}" - # Rewrite lock with correct tx_id - lock_path = make_lock_file(agfs_client, test_dir, tx_id, "S") - - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.RELEASING, - locks=[lock_path], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_exec_includes_incomplete(self, agfs_client, 
vector_store, test_dir): - """EXEC recovery uses recover_all=True → also reverses incomplete entries.""" - new_dir = f"{test_dir}/exec-incomplete" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-inc-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=False, # incomplete, but recover_all=True reverses it - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_init_cleans_locks(self, agfs_client, vector_store, test_dir): - """INIT status → no rollback, just lock cleanup + journal delete.""" - lock_dir = f"{test_dir}/init-lock-dir" - _mkdir_ok(agfs_client, lock_dir) - - tx_id = f"tx-init-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[lock_path], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_acquire_cleans_locks(self, agfs_client, vector_store, test_dir): - """ACQUIRE status → same as INIT, clean up only.""" - lock_dir = f"{test_dir}/acquire-lock-dir" - _mkdir_ok(agfs_client, lock_dir) - - tx_id = f"tx-acq-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = 
TransactionRecord( - id=tx_id, - status=TransactionStatus.ACQUIRE, - locks=[lock_path], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_init_orphan_lock_via_init_info( - self, agfs_client, vector_store, test_dir - ): - """INIT with empty locks but init_info.lock_paths → clean orphan lock owned by tx.""" - orphan_dir = f"{test_dir}/orphan-dir" - _mkdir_ok(agfs_client, orphan_dir) - - tx_id = f"tx-orphan-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, orphan_dir, tx_id, "S") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], # Empty — crash happened before journal recorded locks - init_info={ - "operation": "rm", - "lock_paths": [orphan_dir], - "lock_mode": "subtree", - }, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_init_orphan_lock_other_owner(self, agfs_client, vector_store, test_dir): - """INIT with orphan lock owned by different tx → not removed.""" - orphan_dir = f"{test_dir}/orphan-other" - _mkdir_ok(agfs_client, orphan_dir) - - other_tx_id = f"tx-OTHER-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, orphan_dir, other_tx_id, "S") - - tx_id = f"tx-innocent-{uuid.uuid4().hex[:8]}" - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], - init_info={ - "operation": "rm", - "lock_paths": [orphan_dir], - "lock_mode": "subtree", - 
}, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Lock file should still exist — owned by different tx - assert file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_mv_orphan_both_paths(self, agfs_client, vector_store, test_dir): - """INIT mv operation → check both lock_paths and mv_dst_path for orphan locks.""" - src_dir = f"{test_dir}/mv-orphan-src" - dst_dir = f"{test_dir}/mv-orphan-dst" - _mkdir_ok(agfs_client, src_dir) - _mkdir_ok(agfs_client, dst_dir) - - tx_id = f"tx-mv-orphan-{uuid.uuid4().hex[:8]}" - src_lock = make_lock_file(agfs_client, src_dir, tx_id, "S") - dst_lock = make_lock_file(agfs_client, dst_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], - init_info={ - "operation": "mv", - "lock_paths": [src_dir], - "lock_mode": "mv", - "mv_dst_path": dst_dir, - }, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Both orphan locks should be cleaned up - assert not file_exists(agfs_client, src_lock) - assert not file_exists(agfs_client, dst_lock) - assert tx_id not in journal.list_all() - - async def test_recover_multiple_transactions(self, agfs_client, vector_store, test_dir): - """Multiple journal entries are all recovered.""" - dir_a = f"{test_dir}/multi-tx-a" - _mkdir_ok(agfs_client, dir_a) - - journal = TransactionJournal(agfs_client) - - # tx-a: EXEC with mkdir → should rollback - tx_a = f"tx-multi-a-{uuid.uuid4().hex[:8]}" - record_a = TransactionRecord( - id=tx_a, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - 
params={"uri": dir_a}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record_a) - - # tx-b: COMMIT → no rollback, just cleanup - tx_b = f"tx-multi-b-{uuid.uuid4().hex[:8]}" - record_b = TransactionRecord( - id=tx_b, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record_b) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, dir_a) # rolled back - assert tx_a not in journal.list_all() - assert tx_b not in journal.list_all() - - async def test_recover_corrupted_journal_skips(self, agfs_client, vector_store, test_dir): - """Corrupted journal entry → skipped, others still processed.""" - journal = TransactionJournal(agfs_client) - - # Write a corrupted journal entry (invalid JSON) - bad_tx_id = f"tx-bad-{uuid.uuid4().hex[:8]}" - _mkdir_ok(agfs_client, "/local/_system") - _mkdir_ok(agfs_client, "/local/_system/transactions") - bad_dir = f"/local/_system/transactions/{bad_tx_id}" - _mkdir_ok(agfs_client, bad_dir) - agfs_client.write(f"{bad_dir}/journal.json", b"NOT VALID JSON {{{{") - - # Write a good journal entry - good_dir = f"{test_dir}/good-recovery" - _mkdir_ok(agfs_client, good_dir) - - good_tx_id = f"tx-good-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=good_tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": good_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Good tx should still be recovered - assert not file_exists(agfs_client, good_dir) - assert good_tx_id not in journal.list_all() diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index d7b850c4..1c79414d 
100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -1,437 +1,125 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 -"""End-to-end transaction tests using real AGFS backend. +"""End-to-end lock tests using real AGFS backend. -These tests exercise the full stack: TransactionContext → TransactionManager → -PathLock → Journal → AGFS, verifying the complete acquire → operate → commit/rollback -→ release → journal cleanup lifecycle. +These tests exercise LockContext -> LockManager -> PathLock -> AGFS, +verifying the acquire -> operate -> release lifecycle. """ import uuid import pytest -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager from openviking.storage.transaction.path_lock import LOCK_FILE_NAME -from openviking.storage.transaction.transaction_manager import TransactionManager -@pytest.fixture -def tx_manager(agfs_client): - """Create a real TransactionManager backed by the test AGFS.""" - manager = TransactionManager( - agfs_client=agfs_client, - timeout=3600, - max_parallel_locks=8, - lock_timeout=1.0, - lock_expire=1.0, - ) - return manager - - -class TestE2ECommit: - async def test_full_commit_lifecycle(self, agfs_client, tx_manager, test_dir): - """Full lifecycle: context enter → record undo → commit → locks released → journal cleaned.""" - async with TransactionContext( - tx_manager, "test_write", [test_dir], lock_mode="point" - ) as tx: - # Lock should be acquired - lock_path = f"{test_dir}/{LOCK_FILE_NAME}" - token = agfs_client.cat(lock_path) - assert token is not None +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + """Return True if the lock file does not exist in 
AGFS.""" + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True - # Record some operations - seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/file.txt"}) - agfs_client.write(f"{test_dir}/file.txt", b"hello") - tx.mark_completed(seq) - # Add post action - tx.add_post_action( - "enqueue_semantic", - {"uri": "viking://test", "context_type": "resource", "account_id": "default"}, - ) +@pytest.fixture +def lock_manager(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) - await tx.commit() - # After commit: lock should be released - try: - agfs_client.cat(lock_path) - raise AssertionError("Lock file should be gone after commit") - except Exception: - pass # Expected +class TestLockContextCommit: + async def test_lock_acquired_and_released(self, agfs_client, lock_manager, test_dir): + """Lock is held inside the context and released after exit.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None - # Transaction should be removed from manager - assert tx_manager.get_transaction(tx.record.id) is None + assert _lock_file_gone(agfs_client, lock_path) - async def test_commit_file_persists(self, agfs_client, tx_manager, test_dir): - """Files written inside a committed transaction persist.""" + async def test_file_persists_after_context(self, agfs_client, lock_manager, test_dir): + """Files written inside a lock context persist.""" file_path = f"{test_dir}/committed-file.txt" - async with TransactionContext(tx_manager, "write_op", [test_dir], lock_mode="point") as tx: - seq = tx.record_undo("fs_write_new", {"uri": file_path}) + async with LockContext(lock_manager, [test_dir], lock_mode="point"): agfs_client.write(file_path, b"committed data") - tx.mark_completed(seq) - await tx.commit() content = agfs_client.cat(file_path) assert content == b"committed data" -class 
TestE2ERollback: - async def test_explicit_exception_triggers_rollback(self, agfs_client, tx_manager, test_dir): - """Exception inside context → auto-rollback → undo operations reversed.""" - new_dir = f"{test_dir}/to-be-rolled-back-{uuid.uuid4().hex}" +class TestLockContextException: + async def test_lock_released_on_exception(self, agfs_client, lock_manager, test_dir): + """Lock is released even when an exception occurs inside the context.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "failing_op", [test_dir], lock_mode="point" - ) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) - agfs_client.mkdir(new_dir) - tx.mark_completed(seq) - + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None raise RuntimeError("simulated failure") - # Directory should be removed by rollback - try: - agfs_client.stat(new_dir) - raise AssertionError("Directory should be removed by rollback") - except Exception: - pass + assert _lock_file_gone(agfs_client, lock_path) - # Lock should be released - lock_path = f"{test_dir}/{LOCK_FILE_NAME}" - try: - agfs_client.cat(lock_path) - raise AssertionError("Lock should be released after rollback") - except Exception: - pass - - async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_dir): - """Exiting context without calling commit() triggers auto-rollback.""" - new_dir = f"{test_dir}/forgot-commit-{uuid.uuid4().hex}" - - async with TransactionContext(tx_manager, "no_commit", [test_dir], lock_mode="point") as tx: - seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) - agfs_client.mkdir(new_dir) - tx.mark_completed(seq) - # Intentionally not calling tx.commit() - - # Directory should be removed by rollback - try: - agfs_client.stat(new_dir) - raise AssertionError("Directory should be removed by rollback") - except Exception: - pass - - -class 
TestE2EMvLock: - async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): + async def test_exception_not_swallowed(self, agfs_client, lock_manager, test_dir): + """Exceptions propagate through the context manager.""" + with pytest.raises(ValueError, match="test error"): + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + raise ValueError("test error") + + +class TestLockContextMv: + async def test_mv_lock_acquires_both_paths(self, agfs_client, lock_manager, test_dir): """mv lock mode acquires SUBTREE on both source and destination.""" src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst - ) as tx: - # Both lock files should exist + async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token + assert ":S" in src_token_str + assert ":S" in dst_token_str - assert ":S" in src_token_str # SUBTREE on source - assert ":S" in dst_token_str # SUBTREE on destination - - await tx.commit() - - # Both locks released for path in [f"{src}/{LOCK_FILE_NAME}", f"{dst}/{LOCK_FILE_NAME}"]: - try: - agfs_client.cat(path) - raise AssertionError(f"Lock {path} should be gone") - except Exception: - pass + assert _lock_file_gone(agfs_client, path) -class TestE2ESubtreeRollback: - async def test_subtree_lock_with_rollback(self, agfs_client, tx_manager, test_dir): - """Subtree lock + rollback: undo is executed and lock released.""" - target = f"{test_dir}/sub-rb-{uuid.uuid4().hex}" +class TestLockContextSubtree: + async def test_subtree_lock_and_release(self, 
agfs_client, lock_manager, test_dir): + """Subtree lock is acquired and released.""" + target = f"{test_dir}/sub-{uuid.uuid4().hex}" agfs_client.mkdir(target) - child = f"{target}/child-{uuid.uuid4().hex}" - - with pytest.raises(ValueError): - async with TransactionContext(tx_manager, "rm_op", [target], lock_mode="subtree") as tx: - seq = tx.record_undo("fs_mkdir", {"uri": child}) - agfs_client.mkdir(child) - tx.mark_completed(seq) - - raise ValueError("abort rm") - - # Child dir should be removed by rollback - try: - agfs_client.stat(child) - raise AssertionError("Child should be cleaned up") - except Exception: - pass - - # Lock released - try: - agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") - raise AssertionError("Lock should be released") - except Exception: - pass - - -class TestE2EJournalCleanup: - async def test_journal_cleaned_after_commit(self, agfs_client, tx_manager, test_dir): - """After successful commit, the journal entry for the transaction is deleted.""" - journal = TransactionJournal(agfs_client) - - async with TransactionContext( - tx_manager, "journal_test", [test_dir], lock_mode="point" - ) as tx: - tx_id = tx.record.id - await tx.commit() - - # Journal should be cleaned up - all_ids = journal.list_all() - assert tx_id not in all_ids - - async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, test_dir): - """After rollback, the journal entry is also cleaned up.""" - journal = TransactionJournal(agfs_client) - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "journal_rb", [test_dir], lock_mode="point" - ) as tx: - tx_id = tx.record.id - raise RuntimeError("force rollback") - - all_ids = journal.list_all() - assert tx_id not in all_ids - - -class TestE2EMvRollback: - async def test_mv_rollback_moves_file_back(self, agfs_client, tx_manager, test_dir): - """mv commit 前失败 → 文件被移回原位。""" - src = f"{test_dir}/mv-rb-src-{uuid.uuid4().hex}" - dst_parent = f"{test_dir}/mv-rb-dst-{uuid.uuid4().hex}" - 
agfs_client.mkdir(src) - agfs_client.mkdir(dst_parent) - - # Write a file inside src - agfs_client.write(f"{src}/data.txt", b"important") - - dst = f"{dst_parent}/moved" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent - ) as tx: - seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) - agfs_client.mv(src, dst) - tx.mark_completed(seq) - - raise RuntimeError("abort after mv") - - # src should be restored (mv reversed: dst → src) - content = agfs_client.cat(f"{src}/data.txt") - assert content == b"important" - - # dst should no longer exist - try: - agfs_client.stat(dst) - raise AssertionError("dst should not exist after rollback") - except Exception: - pass - - async def test_mv_commit_persists(self, agfs_client, tx_manager, test_dir): - """mv commit 成功 → 文件在新位置,旧位置不存在。""" - src = f"{test_dir}/mv-ok-src-{uuid.uuid4().hex}" - dst_parent = f"{test_dir}/mv-ok-dst-{uuid.uuid4().hex}" - agfs_client.mkdir(src) - agfs_client.mkdir(dst_parent) - agfs_client.write(f"{src}/data.txt", b"moved-data") - - dst = f"{dst_parent}/moved" - - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent - ) as tx: - seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) - agfs_client.mv(src, dst) - tx.mark_completed(seq) - await tx.commit() - - # File at new location - content = agfs_client.cat(f"{dst}/data.txt") - assert content == b"moved-data" - - # Old location gone - try: - agfs_client.stat(src) - raise AssertionError("src should not exist after committed mv") - except Exception: - pass - - -class TestE2EMultiStepRollback: - async def test_multi_step_rollback_reverses_all(self, agfs_client, tx_manager, test_dir): - """多步操作(mkdir + write + mkdir),中间失败 → 全部反序回滚。 - - 执行顺序:seq0 mkdir /a → seq1 write /a/f.txt → seq2 mkdir /a/sub - 在 seq2 完成后抛异常。 - 回滚顺序:seq2 rm /a/sub → seq1 rm /a/f.txt → seq0 rm /a - """ - dir_a = f"{test_dir}/multi-a-{uuid.uuid4().hex}" 
- file_f = f"{dir_a}/f.txt" - dir_sub = f"{dir_a}/sub" + async with LockContext(lock_manager, [target], lock_mode="subtree"): + token = agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "multi_step", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_write_new", {"uri": file_f}) - agfs_client.write(file_f, b"content") - tx.mark_completed(s1) - - s2 = tx.record_undo("fs_mkdir", {"uri": dir_sub}) - agfs_client.mkdir(dir_sub) - tx.mark_completed(s2) - - raise RuntimeError("abort after all steps") - - # Everything should be cleaned up in reverse order - for path in [dir_sub, file_f, dir_a]: - try: - agfs_client.stat(path) - raise AssertionError(f"{path} should not exist after rollback") - except Exception: - pass + assert _lock_file_gone(agfs_client, f"{target}/{LOCK_FILE_NAME}") - async def test_partial_step_rollback(self, agfs_client, tx_manager, test_dir): - """两步操作,第二步执行到一半崩溃(未 mark_completed)→ 只回滚第一步。 - - seq0 mkdir (completed=True) → seq1 write (completed=False,异常在 mark 前抛出) - 回滚只处理 seq0。 - """ - dir_a = f"{test_dir}/partial-{uuid.uuid4().hex}" - file_f = f"{dir_a}/f.txt" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "partial", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - _s1 = tx.record_undo("fs_write_new", {"uri": file_f}) - agfs_client.write(file_f, b"half-done") - # NOT calling tx.mark_completed(s1) — simulates crash mid-operation - raise RuntimeError("crash before marking s1 completed") - - # dir_a (seq0, completed) should be rolled back - try: - agfs_client.stat(dir_a) - raise AssertionError("dir_a should be rolled back") - 
except Exception: - pass - - # file_f was written but undo entry not marked completed → not rolled back by normal mode - # However, file_f is inside dir_a which was removed, so it's gone too - - async def test_rollback_order_matters_nested_dirs(self, agfs_client, tx_manager, test_dir): - """嵌套目录回滚顺序:必须先删子目录再删父目录。 - - seq0 mkdir /parent → seq1 mkdir /parent/child - 回滚必须 seq1 (rm child) → seq0 (rm parent),否则 parent 非空删除失败。 - """ - parent = f"{test_dir}/nested-parent-{uuid.uuid4().hex}" - child = f"{parent}/child" - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "nested", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": parent}) - agfs_client.mkdir(parent) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_mkdir", {"uri": child}) - agfs_client.mkdir(child) - tx.mark_completed(s1) - - raise RuntimeError("abort nested") - - # Both gone (child first, then parent) - for path in [child, parent]: - try: - agfs_client.stat(path) - raise AssertionError(f"{path} should not exist") - except Exception: - pass - - async def test_rollback_failure_best_effort_continues(self, agfs_client, tx_manager, test_dir): - """回滚中某步失败,后续步骤仍然执行(best-effort)。 - - seq0 mkdir /a → seq1 mkdir /b - 手动删除 /b(模拟回滚 seq1 时目标已不存在),seq0 的回滚仍应执行。 - """ - dir_a = f"{test_dir}/be-a-{uuid.uuid4().hex}" - dir_b = f"{test_dir}/be-b-{uuid.uuid4().hex}" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "best_effort", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_mkdir", {"uri": dir_b}) - agfs_client.mkdir(dir_b) - tx.mark_completed(s1) - - # Manually remove dir_b before rollback — simulates external interference - agfs_client.rm(dir_b) - - raise RuntimeError("abort") - - # dir_b removal during rollback "fails" (already gone), but dir_a should still be rolled back - try: - 
agfs_client.stat(dir_a) - raise AssertionError("dir_a should be rolled back despite dir_b failure") - except Exception: - pass - - -class TestE2ESequentialTransactions: - async def test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): - """Two sequential transactions on the same path both succeed.""" +class TestSequentialLocks: + async def test_sequential_locks_on_same_path(self, agfs_client, lock_manager, test_dir): + """Multiple sequential lock contexts on the same path succeed.""" for i in range(3): - async with TransactionContext( - tx_manager, f"seq_{i}", [test_dir], lock_mode="point" - ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/f{i}.txt"}) + async with LockContext(lock_manager, [test_dir], lock_mode="point"): agfs_client.write(f"{test_dir}/f{i}.txt", f"data-{i}".encode()) - tx.mark_completed(seq) - await tx.commit() - # All files should exist for i in range(3): content = agfs_client.cat(f"{test_dir}/f{i}.txt") assert content == f"data-{i}".encode() - assert tx_manager.get_transaction_count() == 0 + async def test_lock_acquisition_failure(self, agfs_client, lock_manager, test_dir): + """LockContext raises LockAcquisitionError for nonexistent path.""" + nonexistent = f"{test_dir}/nonexistent-{uuid.uuid4().hex}" + with pytest.raises(LockAcquisitionError): + async with LockContext(lock_manager, [nonexistent], lock_mode="point"): + pass diff --git a/tests/transaction/test_journal.py b/tests/transaction/test_journal.py deleted file mode 100644 index 57f1e483..00000000 --- a/tests/transaction/test_journal.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for transaction journal.""" - -import json -import uuid -from unittest.mock import MagicMock - -from openviking.storage.transaction.journal import TransactionJournal - - -class TestTransactionJournal: - def _make_journal(self) -> tuple: - agfs = MagicMock() - journal = TransactionJournal(agfs) - return journal, agfs - - def test_write_calls_agfs_write_with_correct_data(self): - journal, agfs = self._make_journal() - data = {"id": "tx-1", "status": "INIT", "locks": []} - - journal.write(data) - - # Should call agfs.write with the journal path and serialized data - agfs.write.assert_called_once() - path, payload = agfs.write.call_args[0] - assert "tx-1" in path - assert path.endswith("journal.json") - parsed = json.loads(payload.decode("utf-8")) - assert parsed["id"] == "tx-1" - assert parsed["status"] == "INIT" - - def test_write_ensures_directories_exist(self): - journal, agfs = self._make_journal() - data = {"id": "tx-1", "status": "INIT", "locks": []} - - journal.write(data) - - # Should call mkdir at least once (for parent dirs) - assert agfs.mkdir.called - - def test_update_overwrites(self): - journal, agfs = self._make_journal() - data = {"id": "tx-2", "status": "EXEC", "locks": []} - - journal.update(data) - - agfs.write.assert_called_once() - path, payload = agfs.write.call_args[0] - assert json.loads(payload.decode("utf-8"))["status"] == "EXEC" - - def test_read_parses_json(self): - journal, agfs = self._make_journal() - agfs.cat.return_value = json.dumps({"id": "tx-3", "status": "EXEC"}).encode("utf-8") - - result = journal.read("tx-3") - assert result["id"] == "tx-3" - assert result["status"] == "EXEC" - - def test_read_handles_string_response(self): - """Some AGFS backends may return str instead of bytes.""" - journal, agfs = self._make_journal() - agfs.cat.return_value = json.dumps({"id": "tx-str", "status": "INIT"}) - - result = journal.read("tx-str") - assert result["id"] == "tx-str" - - def 
test_delete_removes_directory(self): - journal, agfs = self._make_journal() - journal.delete("tx-4") - agfs.rm.assert_called_once() - path = agfs.rm.call_args[0][0] - assert "tx-4" in path - - def test_list_all_returns_tx_ids(self): - journal, agfs = self._make_journal() - agfs.ls.return_value = [ - {"name": "tx-a", "isDir": True}, - {"name": "tx-b", "isDir": True}, - {"name": ".", "isDir": True}, - ] - - result = journal.list_all() - assert "tx-a" in result - assert "tx-b" in result - assert "." not in result - - def test_list_all_filters_dotdot(self): - journal, agfs = self._make_journal() - agfs.ls.return_value = [ - {"name": "..", "isDir": True}, - {"name": "tx-real", "isDir": True}, - ] - - result = journal.list_all() - assert ".." not in result - assert "tx-real" in result - - def test_list_all_empty_on_error(self): - journal, agfs = self._make_journal() - agfs.ls.side_effect = Exception("not found") - - result = journal.list_all() - assert result == [] - - def test_delete_tolerates_missing(self): - journal, agfs = self._make_journal() - agfs.rm.side_effect = Exception("not found") - # Should not raise - journal.delete("tx-missing") - - def test_write_with_post_actions(self): - journal, agfs = self._make_journal() - data = { - "id": "tx-5", - "status": "COMMIT", - "locks": [], - "post_actions": [ - {"type": "enqueue_semantic", "params": {"uri": "viking://test"}}, - ], - } - journal.write(data) - path, payload = agfs.write.call_args[0] - parsed = json.loads(payload.decode("utf-8")) - assert len(parsed["post_actions"]) == 1 - assert parsed["post_actions"][0]["type"] == "enqueue_semantic" - - def test_write_with_undo_log(self): - journal, agfs = self._make_journal() - data = { - "id": "tx-6", - "status": "EXEC", - "locks": [], - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/a", "dst": "/b"}, - "completed": True, - }, - ], - } - journal.write(data) - _, payload = agfs.write.call_args[0] - parsed = 
json.loads(payload.decode("utf-8")) - assert len(parsed["undo_log"]) == 1 - assert parsed["undo_log"][0]["op_type"] == "fs_mv" - - -class TestTransactionJournalIntegration: - """Integration tests using real AGFS backend to verify persistence behavior.""" - - def test_write_read_roundtrip(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - data = {"id": tx_id, "status": "INIT", "locks": [], "undo_log": []} - - journal.write(data) - result = journal.read(tx_id) - - assert result["id"] == tx_id - assert result["status"] == "INIT" - - journal.delete(tx_id) - - def test_update_overwrites(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.update({"id": tx_id, "status": "EXEC", "locks": []}) - - result = journal.read(tx_id) - assert result["status"] == "EXEC" - - journal.delete(tx_id) - - def test_delete_removes_journal(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.delete(tx_id) - - try: - journal.read(tx_id) - raise AssertionError("Should have raised after deletion") - except Exception: - pass # Expected - - def test_list_all_returns_written_ids(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id_a = f"tx-int-{uuid.uuid4().hex}" - tx_id_b = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id_a, "status": "INIT", "locks": []}) - journal.write({"id": tx_id_b, "status": "INIT", "locks": []}) - - result = journal.list_all() - assert tx_id_a in result - assert tx_id_b in result - - journal.delete(tx_id_a) - journal.delete(tx_id_b) - - def test_list_all_empty_when_none(self, agfs_client): - """After cleanup, list_all should not include previously deleted entries.""" - journal = TransactionJournal(agfs_client) - tx_id = 
f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.delete(tx_id) - - result = journal.list_all() - assert tx_id not in result diff --git a/tests/transaction/test_lock_context.py b/tests/transaction/test_lock_context.py new file mode 100644 index 00000000..37fcb89c --- /dev/null +++ b/tests/transaction/test_lock_context.py @@ -0,0 +1,85 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockContext async context manager.""" + +import uuid + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockContextPoint: + async def test_point_lock_lifecycle(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lm, [test_dir], lock_mode="point") as handle: + assert handle is not None + token = agfs_client.cat(lock_path) + assert token is not None + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_lock_released_on_exception(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + with pytest.raises(RuntimeError): + async with LockContext(lm, [test_dir], lock_mode="point"): + assert agfs_client.cat(lock_path) is not None + raise RuntimeError("fail") + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_exception_propagates(self, lm, test_dir): + with pytest.raises(ValueError, match="test"): + async with LockContext(lm, [test_dir], 
lock_mode="point"): + raise ValueError("test") + + +class TestLockContextSubtree: + async def test_subtree_lock(self, agfs_client, lm, test_dir): + async with LockContext(lm, [test_dir], lock_mode="subtree"): + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + +class TestLockContextMv: + async def test_mv_lock(self, agfs_client, lm, test_dir): + src = f"{test_dir}/src-{uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with LockContext(lm, [src], lock_mode="mv", mv_dst_path=dst) as handle: + assert len(handle.locks) == 2 + + +class TestLockContextFailure: + async def test_nonexistent_path_raises(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + async def test_handle_cleaned_up_on_failure(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + assert len(lm.get_active_handles()) == 0 diff --git a/tests/transaction/test_lock_manager.py b/tests/transaction/test_lock_manager.py new file mode 100644 index 00000000..e30f724b --- /dev/null +++ b/tests/transaction/test_lock_manager.py @@ -0,0 +1,88 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockManager.""" + +import uuid + +import pytest + +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockManagerBasic: + async def test_create_handle_and_acquire_point(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, test_dir) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + assert content is not None + + await lm.release(handle) + assert _lock_file_gone(agfs_client, lock_path) + + async def test_acquire_subtree(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_subtree(handle, test_dir) + assert ok is True + + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + await lm.release(handle) + + async def test_acquire_mv(self, agfs_client, lm, test_dir): + src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + handle = lm.create_handle() + ok = await lm.acquire_mv(handle, src, dst) + assert ok is True + assert len(handle.locks) == 2 + + await lm.release(handle) + assert handle.id not in lm.get_active_handles() + + async def test_release_removes_from_active(self, lm, test_dir): + handle = lm.create_handle() + assert handle.id in lm.get_active_handles() + + await lm.acquire_point(handle, test_dir) + await lm.release(handle) + + assert handle.id not in lm.get_active_handles() + + async def 
test_stop_releases_all(self, agfs_client, lm, test_dir): + h1 = lm.create_handle() + h2 = lm.create_handle() + await lm.acquire_point(h1, test_dir) + + sub = f"{test_dir}/sub-{uuid.uuid4().hex}" + agfs_client.mkdir(sub) + await lm.acquire_point(h2, sub) + + await lm.stop() + assert len(lm.get_active_handles()) == 0 + + async def test_nonexistent_path_fails(self, lm): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, "/local/nonexistent-xyz") + assert ok is False diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py index 2f3b6afc..0b721e07 100644 --- a/tests/transaction/test_path_lock.py +++ b/tests/transaction/test_path_lock.py @@ -5,6 +5,7 @@ import time from unittest.mock import MagicMock +from openviking.storage.transaction.lock_handle import LockHandle from openviking.storage.transaction.path_lock import ( LOCK_FILE_NAME, LOCK_TYPE_POINT, @@ -13,7 +14,6 @@ _make_fencing_token, _parse_fencing_token, ) -from openviking.storage.transaction.transaction_record import TransactionRecord class TestFencingToken: @@ -55,20 +55,20 @@ def test_tokens_are_unique(self): class TestPathLockStale: def test_is_lock_stale_no_file(self): agfs = MagicMock() - agfs.cat.side_effect = Exception("not found") + agfs.read.side_effect = Exception("not found") lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock") is True def test_is_lock_stale_legacy_token(self): agfs = MagicMock() - agfs.cat.return_value = b"tx-old-format" + agfs.read.return_value = b"tx-old-format" lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock") is True def test_is_lock_stale_recent_token(self): agfs = MagicMock() token = _make_fencing_token("tx-1") - agfs.cat.return_value = token.encode("utf-8") + agfs.read.return_value = token.encode("utf-8") lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock", expire_seconds=300.0) is False @@ -78,7 +78,7 @@ class TestPathLockBehavior: async def 
test_acquire_point_creates_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-point-1") + tx = LockHandle(id="tx-point-1") ok = await lock.acquire_point(test_dir, tx, timeout=3.0) assert ok is True @@ -93,7 +93,7 @@ async def test_acquire_point_creates_lock_file(self, agfs_client, test_dir): async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-subtree-1") + tx = LockHandle(id="tx-subtree-1") ok = await lock.acquire_subtree(test_dir, tx, timeout=3.0) assert ok is True @@ -108,7 +108,7 @@ async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): async def test_acquire_point_dir_not_found(self, agfs_client): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-no-dir") + tx = LockHandle(id="tx-no-dir") ok = await lock.acquire_point("/local/nonexistent-path-xyz", tx, timeout=0.5) assert ok is False @@ -116,30 +116,32 @@ async def test_acquire_point_dir_not_found(self, agfs_client): async def test_release_removes_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-release-1") + tx = LockHandle(id="tx-release-1") await lock.acquire_point(test_dir, tx, timeout=3.0) lock_path = f"{test_dir}/{LOCK_FILE_NAME}" await lock.release(tx) - # Lock file should be gone + # Lock file should be gone (use stat, not cat — cat returns b'' for deleted files) try: - agfs_client.cat(lock_path) + agfs_client.stat(lock_path) raise AssertionError("Lock file should have been removed") + except AssertionError: + raise except Exception: pass # Expected: file not found async def test_sequential_acquire_works(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-seq-1") + tx1 = LockHandle(id="tx-seq-1") ok1 = await lock.acquire_point(test_dir, tx1, timeout=3.0) assert ok1 is True await lock.release(tx1) - tx2 = TransactionRecord(id="tx-seq-2") + tx2 
= LockHandle(id="tx-seq-2") ok2 = await lock.acquire_point(test_dir, tx2, timeout=3.0) assert ok2 is True @@ -153,11 +155,11 @@ async def test_point_blocked_by_ancestor_subtree(self, agfs_client, test_dir): agfs_client.mkdir(child) lock = PathLock(agfs_client) - tx_parent = TransactionRecord(id="tx-parent-subtree") + tx_parent = LockHandle(id="tx-parent-subtree") ok = await lock.acquire_subtree(test_dir, tx_parent, timeout=3.0) assert ok is True - tx_child = TransactionRecord(id="tx-child-point") + tx_child = LockHandle(id="tx-child-point") blocked = await lock.acquire_point(child, tx_child, timeout=0.5) assert blocked is False @@ -171,11 +173,11 @@ async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): agfs_client.mkdir(child) lock = PathLock(agfs_client) - tx_child = TransactionRecord(id="tx-desc-point") + tx_child = LockHandle(id="tx-desc-point") ok = await lock.acquire_point(child, tx_child, timeout=3.0) assert ok is True - tx_parent = TransactionRecord(id="tx-parent-sub") + tx_parent = LockHandle(id="tx-parent-sub") blocked = await lock.acquire_subtree(test_dir, tx_parent, timeout=0.5) assert blocked is False @@ -191,7 +193,7 @@ async def test_acquire_mv_creates_subtree_locks(self, agfs_client, test_dir): agfs_client.mkdir(dst) lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-mv-1") + tx = LockHandle(id="tx-mv-1") ok = await lock.acquire_mv(src, dst, tx, timeout=3.0) assert ok is True @@ -223,8 +225,8 @@ async def test_point_does_not_block_sibling_point(self, agfs_client, test_dir): agfs_client.mkdir(dir_b) lock = PathLock(agfs_client) - tx_a = TransactionRecord(id="tx-sib-a") - tx_b = TransactionRecord(id="tx-sib-b") + tx_a = LockHandle(id="tx-sib-a") + tx_b = LockHandle(id="tx-sib-b") ok_a = await lock.acquire_point(dir_a, tx_a, timeout=3.0) ok_b = await lock.acquire_point(dir_b, tx_b, timeout=3.0) @@ -251,7 +253,7 @@ async def test_stale_lock_auto_removed_on_acquire(self, agfs_client, test_dir): # New transaction should 
succeed by auto-removing the stale lock lock = PathLock(agfs_client, lock_expire=300.0) - tx = TransactionRecord(id="tx-new-owner") + tx = LockHandle(id="tx-new-owner") ok = await lock.acquire_point(target, tx, timeout=2.0) assert ok is True @@ -276,7 +278,7 @@ async def test_stale_subtree_ancestor_auto_removed(self, agfs_client, test_dir): agfs_client.write(parent_lock, stale_token.encode("utf-8")) lock = PathLock(agfs_client, lock_expire=300.0) - tx = TransactionRecord(id="tx-child-new") + tx = LockHandle(id="tx-child-new") ok = await lock.acquire_point(child, tx, timeout=2.0) assert ok is True @@ -295,12 +297,12 @@ async def test_point_same_path_no_wait_fails_immediately(self, agfs_client, test agfs_client.mkdir(target) lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-hold") + tx1 = LockHandle(id="tx-hold") ok1 = await lock.acquire_point(target, tx1, timeout=3.0) assert ok1 is True # Second acquire with timeout=0 should fail immediately - tx2 = TransactionRecord(id="tx-blocked") + tx2 = LockHandle(id="tx-blocked") t0 = time.monotonic() ok2 = await lock.acquire_point(target, tx2, timeout=0.0) elapsed = time.monotonic() - t0 @@ -318,11 +320,11 @@ async def test_subtree_same_path_mutual_exclusion(self, agfs_client, test_dir): agfs_client.mkdir(target) lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-sub1") + tx1 = LockHandle(id="tx-sub1") ok1 = await lock.acquire_subtree(target, tx1, timeout=3.0) assert ok1 is True - tx2 = TransactionRecord(id="tx-sub2") + tx2 = LockHandle(id="tx-sub2") ok2 = await lock.acquire_subtree(target, tx2, timeout=0.5) assert ok2 is False diff --git a/tests/transaction/test_post_actions.py b/tests/transaction/test_post_actions.py deleted file mode 100644 index 2ae3c12b..00000000 --- a/tests/transaction/test_post_actions.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for post_actions execution and replay.""" - -from unittest.mock import AsyncMock, MagicMock, patch - -from openviking.storage.transaction.transaction_manager import TransactionManager - - -class TestPostActions: - def _make_manager(self): - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - manager._journal = MagicMock() - return manager, agfs - - async def test_execute_enqueue_semantic(self): - manager, _ = self._make_manager() - - mock_queue = AsyncMock() - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.return_value = mock_queue - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://resources/test", - "context_type": "resource", - "account_id": "acc-1", - }, - } - ] - ) - - mock_queue.enqueue.assert_called_once() - msg = mock_queue.enqueue.call_args[0][0] - assert msg.uri == "viking://resources/test" - assert msg.context_type == "resource" - assert msg.account_id == "acc-1" - - async def test_execute_unknown_action_logged(self): - manager, _ = self._make_manager() - # Should not raise, just log - await manager._execute_post_actions( - [ - {"type": "unknown_action", "params": {}}, - ] - ) - - async def test_execute_multiple_actions(self): - manager, _ = self._make_manager() - - mock_queue = AsyncMock() - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.return_value = mock_queue - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://a", - "context_type": "resource", - "account_id": "acc-1", - }, - }, - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://b", - "context_type": "memory", - "account_id": "acc-2", - }, - }, - ] - 
) - - assert mock_queue.enqueue.call_count == 2 - - async def test_post_action_failure_does_not_crash(self): - manager, _ = self._make_manager() - - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.side_effect = Exception("queue not available") - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - # Should not raise - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test", - "context_type": "resource", - "account_id": "", - }, - }, - ] - ) diff --git a/tests/transaction/test_redo_log.py b/tests/transaction/test_redo_log.py new file mode 100644 index 00000000..8a0def2c --- /dev/null +++ b/tests/transaction/test_redo_log.py @@ -0,0 +1,78 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for RedoLog crash recovery.""" + +import uuid + +import pytest + +from openviking.storage.transaction.redo_log import RedoLog + + +@pytest.fixture +def redo(agfs_client): + return RedoLog(agfs_client) + + +class TestRedoLogBasic: + def test_write_and_read(self, redo): + task_id = uuid.uuid4().hex + info = {"archive_uri": "viking://test/archive", "session_uri": "viking://test/session"} + redo.write_pending(task_id, info) + + result = redo.read(task_id) + assert result["archive_uri"] == "viking://test/archive" + assert result["session_uri"] == "viking://test/session" + + redo.mark_done(task_id) + + def test_list_pending(self, redo): + t1 = uuid.uuid4().hex + t2 = uuid.uuid4().hex + redo.write_pending(t1, {"key": "v1"}) + redo.write_pending(t2, {"key": "v2"}) + + pending = redo.list_pending() + assert t1 in pending + assert t2 in pending + + redo.mark_done(t1) + pending_after = redo.list_pending() + assert t1 not in pending_after + assert t2 in pending_after + + redo.mark_done(t2) + + def test_mark_done_removes_task(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 
1}) + redo.mark_done(task_id) + + pending = redo.list_pending() + assert task_id not in pending + + def test_read_nonexistent_returns_empty(self, redo): + result = redo.read("nonexistent-task-id") + assert result == {} + + def test_list_pending_empty(self, redo): + # Should not crash even if _REDO_ROOT doesn't exist yet + pending = redo.list_pending() + assert isinstance(pending, list) + + def test_mark_done_idempotent(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 1}) + redo.mark_done(task_id) + # Second mark_done should not raise + redo.mark_done(task_id) + + def test_overwrite_pending(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"version": 1}) + redo.write_pending(task_id, {"version": 2}) + + result = redo.read(task_id) + assert result["version"] == 2 + + redo.mark_done(task_id) diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py deleted file mode 100644 index 604b5f50..00000000 --- a/tests/transaction/test_rm_rollback.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Integration tests: multi-step rollback covering FS + VectorDB coordination.""" - -import uuid - -from openviking.storage.transaction.undo import UndoEntry, execute_rollback - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists - - -class TestRmRollback: - async def test_fs_rm_not_reversible(self, agfs_client, test_dir): - """fs_rm is intentionally irreversible: even completed=True is a no-op.""" - path = f"{test_dir}/rm-target" - _mkdir_ok(agfs_client, path) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # Directory still exists — fs_rm rollback does nothing - assert file_exists(agfs_client, path) - - -class TestMvRollback: - async def test_mv_reversed_on_rollback(self, agfs_client, test_dir): - """Real mv → rollback → content back at original location.""" - src = f"{test_dir}/mv-src" - dst = f"{test_dir}/mv-dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/payload.txt", b"important data") - - # Forward mv - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - content = agfs_client.cat(f"{dst}/payload.txt") - assert content == b"important data" - - undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert file_exists(agfs_client, src) - restored = agfs_client.cat(f"{src}/payload.txt") - assert restored == b"important data" - - -class TestRecoverAll: - async def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): - """recover_all=True also reverses entries with completed=False.""" - new_dir = f"{test_dir}/recover-all-dir" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client, recover_all=True) - - assert not 
file_exists(agfs_client, new_dir) - - async def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): - """recover_all=False skips entries with completed=False.""" - new_dir = f"{test_dir}/skip-incomplete" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client, recover_all=False) - - assert file_exists(agfs_client, new_dir) - - -class TestMultiStepRollback: - async def test_reverse_order_nested_dirs(self, agfs_client, test_dir): - """parent + child → rollback reverses in reverse sequence order.""" - parent = f"{test_dir}/multi-parent" - child = f"{test_dir}/multi-parent/child" - _mkdir_ok(agfs_client, parent) - _mkdir_ok(agfs_client, child) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, child) - assert not file_exists(agfs_client, parent) - - async def test_write_new_rollback(self, agfs_client, test_dir): - """New file → rollback → file deleted.""" - file_path = f"{test_dir}/new-file.txt" - agfs_client.write(file_path, b"new content") - assert file_exists(agfs_client, file_path) - - undo_log = [ - UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, file_path) - - async def test_best_effort_continues(self, agfs_client, test_dir): - """If one step fails, subsequent steps still execute.""" - real_dir = f"{test_dir}/best-effort-real" - _mkdir_ok(agfs_client, real_dir) - - undo_log = [ - # seq=0: mkdir rollback on real dir → should succeed - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": real_dir}, completed=True), - # seq=1: mkdir rollback on 
nonexistent dir → fails silently - UndoEntry( - sequence=1, - op_type="fs_mkdir", - params={"uri": f"{test_dir}/no-such-dir-{uuid.uuid4().hex}"}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) - assert not file_exists(agfs_client, real_dir) - - async def test_unknown_op_type_no_crash(self, agfs_client, test_dir): - """Unknown op_type is logged but doesn't raise.""" - undo_log = [ - UndoEntry( - sequence=0, - op_type="some_future_op", - params={"foo": "bar"}, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client) - - -class TestVectorDBRollback: - async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store, request_ctx): - """upsert → delete → rollback(vectordb_delete) → record restored.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/del-restore-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.3] * VECTOR_DIM, - "name": "del-restore", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Snapshot before delete - snapshot = await vector_store.get([record_id], ctx=request_ctx) - assert len(snapshot) == 1 - - # Forward: delete - await vector_store.delete([record_id], ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 0 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [record["uri"]], - "records_snapshot": snapshot, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 1 - - async def 
test_vectordb_delete_multi_record(self, agfs_client, vector_store, request_ctx): - """3 records in snapshot → rollback → all restored.""" - records = [] - for i in range(3): - rid = str(uuid.uuid4()) - rec = { - "id": rid, - "uri": f"viking://resources/multi-{rid}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.1 * (i + 1)] * VECTOR_DIM, - "name": f"multi-{i}", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(rec, ctx=request_ctx) - records.append(rec) - - ids = [r["id"] for r in records] - snapshot = await vector_store.get(ids, ctx=request_ctx) - assert len(snapshot) == 3 - - # Delete all - await vector_store.delete(ids, ctx=request_ctx) - assert len(await vector_store.get(ids, ctx=request_ctx)) == 0 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [r["uri"] for r in records], - "records_snapshot": snapshot, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get(ids, ctx=request_ctx) - assert len(results) == 3 - - async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, request_ctx): - """Empty snapshot → no-op, no error.""" - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [], - "records_snapshot": [], - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): - """upsert → rollback(vectordb_upsert) → record deleted.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": 
f"viking://resources/upsert-del-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.4] * VECTOR_DIM, - "name": "upsert-del", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 0 diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py deleted file mode 100644 index ef0f0b3e..00000000 --- a/tests/transaction/test_transaction_manager.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for TransactionManager: CRUD, lifecycle, commit/rollback flows, timeout cleanup.""" - -import time -from unittest.mock import AsyncMock, MagicMock, patch - -from openviking.storage.transaction.transaction_manager import TransactionManager -from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus - - -def _make_manager(**kwargs): - """Create a TransactionManager with mocked AGFS and journal.""" - agfs = MagicMock() - defaults = {"agfs_client": agfs, "timeout": 3600, "lock_timeout": 0.0, "lock_expire": 300.0} - defaults.update(kwargs) - manager = TransactionManager(**defaults) - manager._journal = MagicMock() - manager._journal.list_all.return_value = [] - return manager, agfs - - -class TestCreateAndGet: - def test_create_transaction_returns_record(self): - manager, _ = _make_manager() - tx = manager.create_transaction(init_info={"operation": "rm"}) - assert isinstance(tx, TransactionRecord) - assert tx.status == TransactionStatus.INIT - assert tx.init_info == {"operation": "rm"} - - def test_create_assigns_unique_ids(self): - manager, _ = _make_manager() - tx1 = manager.create_transaction() - tx2 = manager.create_transaction() - assert tx1.id != tx2.id - - def test_get_transaction_found(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - assert manager.get_transaction(tx.id) is tx - - def test_get_transaction_not_found(self): - manager, _ = _make_manager() - assert manager.get_transaction("nonexistent") is None - - def test_get_transaction_count(self): - manager, _ = _make_manager() - assert manager.get_transaction_count() == 0 - manager.create_transaction() - assert manager.get_transaction_count() == 1 - manager.create_transaction() - assert manager.get_transaction_count() == 2 - - def test_get_active_transactions(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - active = manager.get_active_transactions() - assert tx.id in 
active - # Returned copy, not the internal dict - active.pop(tx.id) - assert manager.get_transaction(tx.id) is tx - - -class TestBegin: - async def test_begin_updates_status(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - ok = await manager.begin(tx.id) - assert ok is True - assert tx.status == TransactionStatus.ACQUIRE - - async def test_begin_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.begin("unknown-tx") - assert ok is False - - -class TestCommitFlow: - async def test_commit_full_lifecycle(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - # Simulate lock acquisition - tx.update_status(TransactionStatus.EXEC) - tx.add_lock("/test/.path.ovlock") - - ok = await manager.commit(tx.id) - assert ok is True - assert tx.status == TransactionStatus.RELEASED - # Removed from active transactions - assert manager.get_transaction(tx.id) is None - # Journal cleaned up - manager._journal.delete.assert_called_once_with(tx.id) - - async def test_commit_persists_journal_before_release(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - call_order = [] - original_update = manager._journal.update - - def track_update(data): - call_order.append(("journal_update", data.get("status"))) - return original_update(data) - - manager._journal.update = track_update - manager._journal.delete = MagicMock( - side_effect=lambda _: call_order.append(("journal_delete",)) - ) - - await manager.commit(tx.id) - # Journal update (COMMIT) happens before delete - assert call_order[0] == ("journal_update", "COMMIT") - - async def test_commit_executes_post_actions(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - tx.post_actions.append({"type": "enqueue_semantic", "params": {"uri": "viking://x"}}) - - with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: 
- await manager.commit(tx.id) - mock_post.assert_called_once() - - async def test_commit_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.commit("nonexistent") - assert ok is False - - async def test_commit_releases_locks(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - tx.add_lock("/a/.path.ovlock") - tx.add_lock("/b/.path.ovlock") - - with patch.object(manager._path_lock, "release", new_callable=AsyncMock) as mock_release: - await manager.commit(tx.id) - mock_release.assert_called_once() - - -class TestRollbackFlow: - async def test_rollback_executes_undo_log(self): - manager, agfs = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - from openviking.storage.transaction.undo import UndoEntry - - tx.undo_log.append( - UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True - ) - ) - - ok = await manager.rollback(tx.id) - assert ok is True - assert tx.status == TransactionStatus.RELEASED - agfs.mv.assert_called_once_with("/b", "/a") - - async def test_rollback_removes_from_active(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - await manager.rollback(tx.id) - assert manager.get_transaction(tx.id) is None - - async def test_rollback_cleans_journal(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - await manager.rollback(tx.id) - manager._journal.delete.assert_called_once_with(tx.id) - - async def test_rollback_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.rollback("nonexistent") - assert ok is False - - async def test_rollback_undo_failure_does_not_prevent_cleanup(self): - """Undo failure is best-effort; lock release and journal cleanup still happen.""" - manager, agfs = _make_manager() - tx = manager.create_transaction() - 
tx.update_status(TransactionStatus.EXEC) - - from openviking.storage.transaction.undo import UndoEntry - - tx.undo_log.append( - UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True - ) - ) - agfs.mv.side_effect = Exception("disk error") - - ok = await manager.rollback(tx.id) - assert ok is True - manager._journal.delete.assert_called_once() - - -class TestLockAcquisitionWrappers: - async def test_acquire_lock_point_success_transitions_to_exec(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_point(tx.id, "/test") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_point_failure_transitions_to_fail(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=False - ): - ok = await manager.acquire_lock_point(tx.id, "/test") - assert ok is False - assert tx.status == TransactionStatus.FAIL - - async def test_acquire_lock_subtree_success(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_subtree(tx.id, "/test") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_subtree_uses_config_timeout(self): - manager, _ = _make_manager(lock_timeout=5.0) - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ) as mock_acquire: - await manager.acquire_lock_subtree(tx.id, "/test") - mock_acquire.assert_called_once_with("/test", tx, timeout=5.0) - - async def test_acquire_lock_subtree_override_timeout(self): - manager, _ = 
_make_manager(lock_timeout=5.0) - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ) as mock_acquire: - await manager.acquire_lock_subtree(tx.id, "/test", timeout=10.0) - mock_acquire.assert_called_once_with("/test", tx, timeout=10.0) - - async def test_acquire_lock_mv_success(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_mv", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_mv(tx.id, "/src", "/dst") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.acquire_lock_point("nonexistent", "/test") - assert ok is False - - -class TestLifecycle: - async def test_start_sets_running(self): - manager, _ = _make_manager() - await manager.start() - assert manager._running is True - manager.stop() - - async def test_start_idempotent(self): - manager, _ = _make_manager() - await manager.start() - await manager.start() # Should not error - assert manager._running is True - await manager.stop() - - async def test_stop_clears_state(self): - manager, _ = _make_manager() - await manager.start() - manager.create_transaction() - await manager.stop() - assert manager._running is False - assert manager.get_transaction_count() == 0 - - async def test_stop_idempotent(self): - manager, _ = _make_manager() - await manager.stop() - await manager.stop() # Should not error - - -class TestTimeoutCleanup: - async def test_cleanup_timed_out_rolls_back(self): - manager, _ = _make_manager(timeout=1) - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - # Simulate old updated_at - tx.updated_at = time.time() - 10 - - with patch.object( - manager, "rollback", new_callable=AsyncMock, return_value=True - ) as mock_rb: - await manager._cleanup_timed_out() - 
mock_rb.assert_called_once_with(tx.id) - - async def test_cleanup_skips_fresh_transactions(self): - manager, _ = _make_manager(timeout=3600) - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - with patch.object(manager, "rollback", new_callable=AsyncMock) as mock_rb: - await manager._cleanup_timed_out() - mock_rb.assert_not_called() diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py deleted file mode 100644 index aff57887..00000000 --- a/tests/transaction/test_undo.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -"""Tests for undo log and rollback executor.""" - -import uuid - -from openviking.storage.transaction.undo import UndoEntry, execute_rollback - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists - - -class TestUndoEntry: - def test_to_dict(self): - entry = UndoEntry(sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}) - d = entry.to_dict() - assert d["sequence"] == 0 - assert d["op_type"] == "fs_mv" - assert d["params"] == {"src": "/a", "dst": "/b"} - assert d["completed"] is False - - def test_from_dict(self): - data = {"sequence": 1, "op_type": "fs_rm", "params": {"uri": "/x"}, "completed": True} - entry = UndoEntry.from_dict(data) - assert entry.sequence == 1 - assert entry.op_type == "fs_rm" - assert entry.completed is True - - def test_roundtrip(self): - entry = UndoEntry( - sequence=5, op_type="vectordb_upsert", params={"record_id": "r1"}, completed=True - ) - restored = UndoEntry.from_dict(entry.to_dict()) - assert restored.sequence == entry.sequence - assert restored.op_type == entry.op_type - assert restored.params == entry.params - assert restored.completed == entry.completed - - -class TestExecuteRollback: - """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" - - async def test_rollback_fs_mv(self, agfs_client, test_dir): - src = f"{test_dir}/src" - 
dst = f"{test_dir}/dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/data.txt", b"hello") - - # Forward: mv src → dst - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - assert file_exists(agfs_client, dst) - - undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - # src restored, dst gone - assert file_exists(agfs_client, src) - assert not file_exists(agfs_client, dst) - - async def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): - path = f"{test_dir}/will-not-delete" - _mkdir_ok(agfs_client, path) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # fs_rm rollback is a no-op; directory still exists - assert file_exists(agfs_client, path) - - async def test_rollback_fs_mkdir(self, agfs_client, test_dir): - new_dir = f"{test_dir}/created" - _mkdir_ok(agfs_client, new_dir) - assert file_exists(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, new_dir) - - async def test_rollback_fs_write_new(self, agfs_client, test_dir): - file_path = f"{test_dir}/new-file.txt" - agfs_client.write(file_path, b"content") - assert file_exists(agfs_client, file_path) - - undo_log = [ - UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, file_path) - - async def test_rollback_reverse_order(self, agfs_client, test_dir): - """mkdir parent + child → rollback → both removed in reverse order.""" - parent = f"{test_dir}/parent" - child = f"{test_dir}/parent/child" - _mkdir_ok(agfs_client, parent) - _mkdir_ok(agfs_client, child) - - 
undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # child removed first (seq=1), then parent (seq=0) - assert not file_exists(agfs_client, child) - assert not file_exists(agfs_client, parent) - - async def test_rollback_skips_incomplete(self, agfs_client, test_dir): - new_dir = f"{test_dir}/incomplete" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client) - - # completed=False → not rolled back - assert file_exists(agfs_client, new_dir) - - async def test_rollback_best_effort(self, agfs_client, test_dir): - """A failing rollback entry should not prevent others from running.""" - real_dir = f"{test_dir}/real-dir" - _mkdir_ok(agfs_client, real_dir) - - src = f"{test_dir}/be-src" - dst = f"{test_dir}/be-dst" - _mkdir_ok(agfs_client, dst) - - undo_log = [ - # seq=0: fs_mv rollback will succeed - UndoEntry(sequence=0, op_type="fs_mv", params={"src": src, "dst": dst}, completed=True), - # seq=1: fs_mkdir rollback will fail (rm on non-empty or non-existent path) - UndoEntry( - sequence=1, - op_type="fs_mkdir", - params={"uri": f"{test_dir}/nonexistent-dir-xyz"}, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client) - - # seq=0 mv rollback should have executed (dst → src) - assert file_exists(agfs_client, src) - - async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request_ctx): - """Real upsert → rollback → record deleted.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/test-upsert-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.1] * VECTOR_DIM, - 
"name": "test", - "description": "test record", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Confirm it exists - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 1 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 0 - - async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, request_ctx): - """Real upsert → update_uri_mapping → rollback → URI restored.""" - record_id = str(uuid.uuid4()) - old_uri = f"viking://resources/old-{record_id}.md" - new_uri = f"viking://resources/new-{record_id}.md" - record = { - "id": record_id, - "uri": old_uri, - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.2] * VECTOR_DIM, - "name": "test", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Forward: update URI mapping - await vector_store.update_uri_mapping( - ctx=request_ctx, - uri=old_uri, - new_uri=new_uri, - new_parent_uri="viking://resources/", - ) - - # Verify forward operation - result = await vector_store.fetch_by_uri(new_uri, ctx=request_ctx) - assert result is not None - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_update_uri", - params={ - "old_uri": old_uri, - "new_uri": new_uri, - "old_parent_uri": "viking://resources/", - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - # URI should be restored to old_uri - result = await 
vector_store.fetch_by_uri(old_uri, ctx=request_ctx) - assert result is not None From fe33516144494bff5770e320709cc3d90d1ee1e5 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 20:57:37 +0800 Subject: [PATCH 13/18] fix(transaction): remove checkpoint dead code, fix TOCTOU race, clarify mv lock param - Remove unused _write_checkpoint/_write_checkpoint_async/_read_checkpoint from Session (superseded by redo-log) - Re-resolve URI inside lock in resource_processor Phase 3.5 to prevent concurrent add_resource calls from resolving to the same final_uri - Rename acquire_mv dst_path to dst_parent_path with docstring to clarify that callers pass the destination parent directory --- docs/en/concepts/09-transaction.md | 2 +- docs/zh/concepts/09-transaction.md | 2 +- openviking/core/building_tree.py | 1 + openviking/parse/tree_builder.py | 2 + openviking/session/session.py | 50 ------------------- .../storage/transaction/lock_context.py | 10 ++-- .../storage/transaction/lock_manager.py | 4 +- openviking/storage/transaction/path_lock.py | 29 ++++++++--- openviking/storage/viking_fs.py | 2 +- openviking/utils/resource_processor.py | 8 +++ tests/transaction/test_e2e.py | 2 +- tests/transaction/test_lock_context.py | 2 +- 12 files changed, 46 insertions(+), 68 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 1ada00dc..0e95bdf1 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -205,7 +205,7 @@ async with LockContext(lock_manager, [path], lock_mode="subtree"): pass # MV lock (move operations) -async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): # Perform operations... 
pass ``` diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 31d09c54..81d27e02 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -204,7 +204,7 @@ async with LockContext(lock_manager, [path], lock_mode="subtree"): pass # MV 锁(移动操作) -async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): # 执行操作... pass ``` diff --git a/openviking/core/building_tree.py b/openviking/core/building_tree.py index 9685a56d..43207e29 100644 --- a/openviking/core/building_tree.py +++ b/openviking/core/building_tree.py @@ -28,6 +28,7 @@ def __init__( self._contexts: List["Context"] = [] self._uri_map: Dict[str, "Context"] = {} self._root_uri: Optional[str] = None + self._candidate_uri: Optional[str] = None def add_context(self, context: "Context") -> None: """Add a context to the tree.""" diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 5f070cee..deea5efc 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -177,6 +177,8 @@ async def finalize_from_temp( source_format=source_format, ) tree._root_uri = final_uri + if not to_uri: + tree._candidate_uri = candidate_uri # Create a minimal Context object for the root so that tree.root is not None root_context = Context(uri=final_uri, temp_uri=temp_doc_uri) diff --git a/openviking/session/session.py b/openviking/session/session.py index c0f87bd9..bdb6500b 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -731,56 +731,6 @@ def _write_relations(self) -> None: except Exception as e: logger.warning(f"Failed to create relation to {usage.uri}: {e}") - def _write_checkpoint(self, data: Dict[str, Any]) -> None: - """Write a commit checkpoint file for crash recovery.""" - if not self._viking_fs: - return - - checkpoint = { - **data, - "session_id": self.session_id, - 
"compression_index": self._compression.compression_index, - "timestamp": get_current_timestamp(), - } - run_async( - self._viking_fs.write_file( - f"{self._session_uri}/.commit_checkpoint.json", - json.dumps(checkpoint, ensure_ascii=False), - ctx=self.ctx, - ) - ) - - async def _write_checkpoint_async(self, data: Dict[str, Any]) -> None: - """Write a commit checkpoint file for crash recovery (async).""" - if not self._viking_fs: - return - - checkpoint = { - **data, - "session_id": self.session_id, - "compression_index": self._compression.compression_index, - "timestamp": get_current_timestamp(), - } - await self._viking_fs.write_file( - f"{self._session_uri}/.commit_checkpoint.json", - json.dumps(checkpoint, ensure_ascii=False), - ctx=self.ctx, - ) - - def _read_checkpoint(self) -> Optional[Dict[str, Any]]: - """Read commit checkpoint file if it exists.""" - if not self._viking_fs: - return None - try: - content = run_async( - self._viking_fs.read_file( - f"{self._session_uri}/.commit_checkpoint.json", ctx=self.ctx - ) - ) - return json.loads(content) - except Exception: - return None - async def _write_relations_async(self) -> None: """Create relations to used contexts/tools (async).""" if not self._viking_fs: diff --git a/openviking/storage/transaction/lock_context.py b/openviking/storage/transaction/lock_context.py index 62fc15ba..4d1d8443 100644 --- a/openviking/storage/transaction/lock_context.py +++ b/openviking/storage/transaction/lock_context.py @@ -21,13 +21,13 @@ def __init__( lock_manager: LockManager, paths: list[str], lock_mode: str = "point", - mv_dst_path: Optional[str] = None, + mv_dst_parent_path: Optional[str] = None, src_is_dir: bool = True, ): self._manager = lock_manager self._paths = paths self._lock_mode = lock_mode - self._mv_dst_path = mv_dst_path + self._mv_dst_parent_path = mv_dst_parent_path self._src_is_dir = src_is_dir self._handle: Optional[LockHandle] = None @@ -41,12 +41,12 @@ async def __aenter__(self) -> LockHandle: if not 
success: break elif self._lock_mode == "mv": - if self._mv_dst_path is None: - raise LockAcquisitionError("mv lock mode requires mv_dst_path") + if self._mv_dst_parent_path is None: + raise LockAcquisitionError("mv lock mode requires mv_dst_parent_path") success = await self._manager.acquire_mv( self._handle, self._paths[0], - self._mv_dst_path, + self._mv_dst_parent_path, src_is_dir=self._src_is_dir, ) else: # "point" diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 5e2e5076..7e5184ab 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -82,13 +82,13 @@ async def acquire_mv( self, handle: LockHandle, src: str, - dst: str, + dst_parent: str, src_is_dir: bool = True, timeout: Optional[float] = None, ) -> bool: return await self._path_lock.acquire_mv( src, - dst, + dst_parent, handle, timeout=timeout if timeout is not None else self._lock_timeout, src_is_dir=src_is_dir, diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index d9212b3b..345f2661 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -328,17 +328,32 @@ async def acquire_subtree(self, path: str, owner: LockOwner, timeout: float = 0. async def acquire_mv( self, src_path: str, - dst_path: str, + dst_parent_path: str, owner: LockOwner, timeout: float = 0.0, src_is_dir: bool = True, ) -> bool: + """Acquire locks for a move operation. + + Args: + src_path: Source path to lock. + dst_parent_path: Parent directory of the destination to lock. + Callers typically pass the destination's parent so that the + lock covers sibling-level conflicts without requiring the + target to exist yet. + owner: Lock owner handle. + timeout: Maximum seconds to wait for each lock. + src_is_dir: Whether the source is a directory (SUBTREE lock) + or a file (POINT lock on parent). 
+ """ if src_is_dir: if not await self.acquire_subtree(src_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - if not await self.acquire_subtree(dst_path, owner, timeout=timeout): - logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") + if not await self.acquire_subtree(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire SUBTREE lock on destination parent: {dst_parent_path}" + ) await self.release(owner) return False else: @@ -346,12 +361,14 @@ async def acquire_mv( if not await self.acquire_point(src_parent, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") return False - if not await self.acquire_point(dst_path, owner, timeout=timeout): - logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + if not await self.acquire_point(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire POINT lock on destination parent: {dst_parent_path}" + ) await self.release(owner) return False - logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") + logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_parent_path}") return True async def release(self, owner: LockOwner) -> None: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 72475573..d9119c11 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -357,7 +357,7 @@ async def mv( get_lock_manager(), [old_path], lock_mode="mv", - mv_dst_path=dst_parent, + mv_dst_parent_path=dst_parent, src_is_dir=is_dir, ): uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 42ca8752..73bca2f6 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -213,6 +213,7 @@ 
async def process_resource( # ============ Phase 3.5: 首次添加立即落盘 ============ root_uri = result.get("root_uri") temp_uri = result.get("temp_uri") # temp_doc_uri + candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None if root_uri and temp_uri: viking_fs = get_viking_fs() @@ -230,6 +231,13 @@ async def process_resource( await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): + # Re-resolve URI inside lock to prevent TOCTOU race where + # concurrent add_resource calls resolve to the same final_uri. + if candidate_uri: + root_uri = await self.tree_builder._resolve_unique_uri(candidate_uri) + result["root_uri"] = root_uri + dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 1c79414d..2f284f53 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -80,7 +80,7 @@ async def test_mv_lock_acquires_both_paths(self, agfs_client, lock_manager, test agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token diff --git a/tests/transaction/test_lock_context.py b/tests/transaction/test_lock_context.py index 37fcb89c..131fb48e 100644 --- a/tests/transaction/test_lock_context.py +++ b/tests/transaction/test_lock_context.py @@ -67,7 +67,7 @@ async def test_mv_lock(self, agfs_client, lm, test_dir): agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with LockContext(lm, [src], lock_mode="mv", 
mv_dst_path=dst) as handle: + async with LockContext(lm, [src], lock_mode="mv", mv_dst_parent_path=dst) as handle: assert len(handle.locks) == 2 From 1010bd4e4f4df88be003dcb32ba23e8fa410f771 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 21:29:02 +0800 Subject: [PATCH 14/18] fix: path --- .../storage/transaction/lock_manager.py | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 7e5184ab..2d8ca809 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -125,10 +125,15 @@ async def _recover_pending_redo(self) -> None: logger.error(f"Redo recovery failed for {task_id}: {e}", exc_info=True) async def _redo_session_memory(self, info: Dict[str, Any]) -> None: - """Re-extract memories from archive.""" + """Re-extract memories from archive. + + Lets exceptions from _enqueue_semantic propagate so the caller + can decide whether to mark the redo task as done. + """ from openviking.message import Message from openviking.server.identity import RequestContext, Role from openviking.session.compressor import SessionCompressor + from openviking.storage.viking_fs import get_viking_fs from openviking_cli.session.user_id import UserIdentifier archive_uri = info.get("archive_uri") @@ -139,51 +144,46 @@ async def _redo_session_memory(self, info: Dict[str, Any]) -> None: role_str = info.get("role", "root") if not archive_uri or not session_uri: - logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") - return + raise ValueError("Cannot redo session_memory: missing archive_uri or session_uri") + + # 1. Build request context (needed for path conversion below) + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str)) - # 1. 
Read archived messages - messages_path = f"{archive_uri}/messages.jsonl" + # 2. Read archived messages + messages_uri = f"{archive_uri}/messages.jsonl" + viking_fs = get_viking_fs() + agfs_path = viking_fs._uri_to_path(messages_uri, ctx=ctx) + messages = [] try: - agfs_path = messages_path.replace("viking://", "") content = self._agfs.cat(agfs_path) if isinstance(content, bytes): content = content.decode("utf-8") + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass except Exception as e: - logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") - return + logger.warning(f"Cannot read archive for redo: {agfs_path}: {e}") - messages = [] - for line in content.strip().split("\n"): - if line.strip(): - try: - messages.append(Message.from_dict(json.loads(line))) - except Exception: - pass - - if not messages: - logger.warning(f"No messages found in archive for redo: {archive_uri}") - return - - # 2. Build request context - user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) - ctx = RequestContext(user=user, role=Role(role_str)) - - # 3. Re-extract memories (best-effort: skip if compressor not available) - session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] - try: - compressor = SessionCompressor(vikingdb=None) - memories = await compressor.extract_long_term_memories( - messages=messages, - user=user, - session_id=session_id, - ctx=ctx, - ) - logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") - except Exception as e: - logger.warning(f"Redo: memory extraction skipped ({e}), will retry via queue") + # 3. 
Re-extract memories (best-effort, only if archive was readable) + if messages: + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + try: + compressor = SessionCompressor(vikingdb=None) + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + except Exception as e: + logger.warning(f"Redo: memory extraction failed ({e}), falling back to queue") - # 4. Enqueue semantic processing + # 4. Always enqueue semantic processing as fallback await self._enqueue_semantic( uri=session_uri, context_type="memory", From 715739da41983bddf5ec6146a96db43ca4d10b1f Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 23:53:24 +0800 Subject: [PATCH 15/18] fix: resource lock --- openviking/storage/errors.py | 4 + openviking/storage/queuefs/semantic_dag.py | 91 +++++++++++++---- openviking/storage/queuefs/semantic_msg.py | 4 + .../storage/queuefs/semantic_processor.py | 98 ++++++++----------- .../storage/transaction/lock_manager.py | 6 ++ openviking/storage/transaction/path_lock.py | 13 +++ openviking/storage/viking_fs.py | 18 ++-- openviking/utils/resource_processor.py | 64 +++++++----- openviking/utils/summarizer.py | 2 + 9 files changed, 192 insertions(+), 108 deletions(-) diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index 010200e7..def786be 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -37,3 +37,7 @@ class LockError(VikingDBException): class LockAcquisitionError(LockError): """Raised when lock acquisition fails.""" + + +class ResourceBusyError(LockError): + """Raised when a resource is locked by an ongoing operation (e.g. 
semantic processing).""" diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 4ee10a93..d626fa82 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -75,6 +75,7 @@ def __init__( target_uri: Optional[str] = None, semantic_msg_id: Optional[str] = None, recursive: bool = True, + lifecycle_lock_handle_id: str = "", ): self._processor = processor self._context_type = context_type @@ -84,6 +85,7 @@ def __init__( self._target_uri = target_uri self._semantic_msg_id = semantic_msg_id self._recursive = recursive + self._lifecycle_lock_handle_id = lifecycle_lock_handle_id self._llm_sem = asyncio.Semaphore(max_concurrent_llm) self._viking_fs = get_viking_fs() self._nodes: Dict[str, DirNode] = {} @@ -98,6 +100,7 @@ def __init__( self._dir_change_status: Dict[str, bool] = {} self._overview_cache: Dict[str, Dict[str, str]] = {} self._overview_cache_lock = asyncio.Lock() + self._refresh_task: Optional[asyncio.Task] = None def _create_on_complete_callback(self) -> Optional[Callable[[], Awaitable[None]]]: """Create on_complete callback for incremental update or full update.""" @@ -160,10 +163,27 @@ async def run(self, root_uri: str) -> None: """Run DAG execution starting from root_uri.""" self._root_uri = root_uri self._root_done = asyncio.Event() - await self._dispatch_dir(root_uri, parent_uri=None) - await self._root_done.wait() - on_complete = self._create_on_complete_callback() + # Start lifecycle lock refresh loop if we hold a lock + if self._lifecycle_lock_handle_id: + self._refresh_task = asyncio.create_task(self._lock_refresh_loop()) + + try: + await self._dispatch_dir(root_uri, parent_uri=None) + await self._root_done.wait() + except Exception: + await self._release_lifecycle_lock() + raise + + original_on_complete = self._create_on_complete_callback() + + # Wrap on_complete to release lifecycle lock after all processing + async def wrapped_on_complete() -> None: + 
try: + if original_on_complete: + await original_on_complete() + finally: + await self._release_lifecycle_lock() async with self._vectorize_lock: task_count = self._vectorize_task_count @@ -176,7 +196,7 @@ async def run(self, root_uri: str) -> None: await tracker.register( semantic_msg_id=self._semantic_msg_id, total_count=task_count, - on_complete=on_complete, + on_complete=wrapped_on_complete, metadata={"uri": root_uri}, ) @@ -203,9 +223,10 @@ async def run(self, root_uri: str) -> None: semantic_msg_id=task.semantic_msg_id, ) ) - elif on_complete: + else: + # No vectorize tasks — release lock immediately (via wrapped callback) try: - await on_complete() + await wrapped_on_complete() except Exception as e: logger.error(f"Error in on_complete callback: {e}", exc_info=True) @@ -505,9 +526,6 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: return results async def _overview_task(self, dir_uri: str) -> None: - from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import LockContext, get_lock_manager - node = self._nodes.get(dir_uri) if not node: return @@ -536,17 +554,12 @@ async def _overview_task(self, dir_uri: str) -> None: abstract = self._processor._extract_abstract_from_overview(overview) overview, abstract = self._processor._enforce_size_limits(overview, abstract) - dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) + # Write directly — protected by the outer lifecycle SUBTREE lock try: - async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): - await self._viking_fs.write_file( - f"{dir_uri}/.overview.md", overview, ctx=self._ctx - ) - await self._viking_fs.write_file( - f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx - ) - except LockAcquisitionError: - logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") + await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) + await 
self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) + except Exception: + logger.info(f"[SemanticDag] {dir_uri} write failed, skipping") try: if need_vectorize: @@ -588,6 +601,46 @@ async def _add_vectorize_task(self, task: VectorizeTask) -> None: else: # directory self._vectorize_task_count += 2 + async def _lock_refresh_loop(self) -> None: + """Periodically refresh lifecycle lock to prevent stale expiry.""" + from openviking.storage.transaction import get_lock_manager + + try: + interval = get_lock_manager()._path_lock._lock_expire / 2 + except Exception: + interval = 150.0 + + while True: + try: + await asyncio.sleep(interval) + handle = get_lock_manager().get_handle(self._lifecycle_lock_handle_id) + if handle: + await get_lock_manager().refresh_lock(handle) + else: + break + except asyncio.CancelledError: + break + except Exception as e: + logger.warning(f"[SemanticDag] Lock refresh failed: {e}") + + async def _release_lifecycle_lock(self) -> None: + """Stop refresh loop and release lifecycle lock.""" + if self._refresh_task and not self._refresh_task.done(): + self._refresh_task.cancel() + self._refresh_task = None + if not self._lifecycle_lock_handle_id: + return + handle_id = self._lifecycle_lock_handle_id + self._lifecycle_lock_handle_id = "" + try: + from openviking.storage.transaction import get_lock_manager + + handle = get_lock_manager().get_handle(handle_id) + if handle: + await get_lock_manager().release(handle) + except Exception as e: + logger.warning(f"[SemanticDag] Failed to release lifecycle lock {handle_id}: {e}") + def get_stats(self) -> DagStats: return DagStats( total_nodes=self._stats.total_nodes, diff --git a/openviking/storage/queuefs/semantic_msg.py b/openviking/storage/queuefs/semantic_msg.py index f6acdaf4..720948e8 100644 --- a/openviking/storage/queuefs/semantic_msg.py +++ b/openviking/storage/queuefs/semantic_msg.py @@ -39,6 +39,7 @@ class SemanticMsg: skip_vectorization: bool = False telemetry_id: str = "" 
target_uri: str = "" + lifecycle_lock_handle_id: str = "" changes: Optional[Dict[str, List[str]]] = ( None # {"added": [...], "modified": [...], "deleted": [...]} ) @@ -55,6 +56,7 @@ def __init__( skip_vectorization: bool = False, telemetry_id: str = "", target_uri: str = "", + lifecycle_lock_handle_id: str = "", changes: Optional[Dict[str, List[str]]] = None, ): self.id = str(uuid4()) @@ -68,6 +70,7 @@ def __init__( self.skip_vectorization = skip_vectorization self.telemetry_id = telemetry_id self.target_uri = target_uri + self.lifecycle_lock_handle_id = lifecycle_lock_handle_id self.changes = changes def to_dict(self) -> Dict[str, Any]: @@ -106,6 +109,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg": skip_vectorization=data.get("skip_vectorization", False), telemetry_id=data.get("telemetry_id", ""), target_uri=data.get("target_uri", ""), + lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""), changes=data.get("changes"), ) if "id" in data and data["id"]: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 0db4019b..98d32f7a 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -239,6 +239,14 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, f"Target URI exists, using incremental update: {msg.target_uri}" ) + # Re-acquire lifecycle lock if handle was lost (e.g. 
server restart) + if msg.lifecycle_lock_handle_id: + lock_uri = msg.target_uri or msg.uri + msg.lifecycle_lock_handle_id = await self._ensure_lifecycle_lock( + msg.lifecycle_lock_handle_id, + viking_fs._uri_to_path(lock_uri, ctx=self._current_ctx), + ) + executor = SemanticDagExecutor( processor=self, context_type=msg.context_type, @@ -248,6 +256,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, target_uri=msg.target_uri, semantic_msg_id=msg.id, recursive=msg.recursive, + lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id, ) self._dag_executor = executor await executor.run(msg.uri) @@ -268,6 +277,22 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, self.report_error(str(e), data) return None finally: + # Safety net: release lifecycle lock if still held (e.g. on exception + # before the DAG executor took ownership) + if msg and msg.lifecycle_lock_handle_id: + try: + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + handle = lm.get_handle(msg.lifecycle_lock_handle_id) + if handle: + await lm.release(handle) + logger.info( + f"[SemanticProcessor] Safety-net released lifecycle lock " + f"{msg.lifecycle_lock_handle_id}" + ) + except Exception: + pass self._current_msg = None self._current_ctx = None @@ -276,63 +301,24 @@ def get_dag_stats(self) -> Optional["DagStats"]: return None return self._dag_executor.get_stats() - async def _process_single_directory( - self, - uri: str, - context_type: str, - children_uris: List[str], - file_paths: List[str], - ) -> None: - """Process single directory, generate .abstract.md and .overview.md.""" - from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import LockContext, get_lock_manager - - viking_fs = get_viking_fs() - dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) + @staticmethod + async def _ensure_lifecycle_lock(handle_id: str, lock_path: str) -> str: + """If 
the handle is missing (server restart), re-acquire a SUBTREE lock. - try: - async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): - # 1. Collect .abstract.md from subdirectories - children_abstracts = await self._collect_children_abstracts(children_uris) - - # 2. Concurrently generate summaries for files in directory - tasks = [ - self._generate_single_file_summary(fp, ctx=self._current_ctx) - for fp in file_paths - ] - file_summaries = await asyncio.gather(*tasks) - - # 3. Generate .overview.md - overview = await self._generate_overview(uri, file_summaries, children_abstracts) - - # 4. Extract abstract from overview - abstract = self._extract_abstract_from_overview(overview) - - # 5. Write files - await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) - await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) - - logger.debug(f"Generated overview and abstract for {uri}") - - # 6. Vectorize directory and files concurrently - vectorize_tasks = [ - self._vectorize_directory_simple(uri, context_type, abstract, overview), - *( - self._vectorize_single_file( - parent_uri=uri, - context_type=context_type, - file_path=fp, - summary_dict=summary, - ) - for fp, summary in zip(file_paths, file_summaries) - ), - ] - results = await asyncio.gather(*vectorize_tasks, return_exceptions=True) - for result in results: - if isinstance(result, Exception): - logger.error(f"Vectorization failed: {result}", exc_info=True) - except LockAcquisitionError: - logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") + Returns the (possibly new) handle ID, or "" on failure. 
+ """ + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + if lm.get_handle(handle_id): + return handle_id + new_handle = lm.create_handle() + if await lm.acquire_subtree(new_handle, lock_path): + logger.info(f"Re-acquired lifecycle lock on {lock_path} (handle {new_handle.id})") + return new_handle.id + logger.warning(f"Failed to re-acquire lifecycle lock on {lock_path}") + await lm.release(new_handle) + return "" async def _process_memory_directory(self, msg: SemanticMsg) -> None: """Process a memory directory with special handling. diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 2d8ca809..56dedd79 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -94,6 +94,12 @@ async def acquire_mv( src_is_dir=src_is_dir, ) + def get_handle(self, handle_id: str) -> Optional[LockHandle]: + return self._handles.get(handle_id) + + async def refresh_lock(self, handle: LockHandle) -> None: + await self._path_lock.refresh(handle) + async def release(self, handle: LockHandle) -> None: await self._path_lock.release(handle) self._handles.pop(handle.id, None) diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 345f2661..2aaaecf1 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -371,6 +371,19 @@ async def acquire_mv( logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_parent_path}") return True + async def refresh(self, owner: LockOwner) -> None: + """Rewrite all lock file timestamps to prevent stale cleanup.""" + for lock_path in list(owner.locks): + token = self._read_token(lock_path) + if token: + parsed_owner_id, _, lock_type = _parse_fencing_token(token) + if parsed_owner_id == owner.id: + new_token = _make_fencing_token(owner.id, lock_type) + try: + self._agfs.write(lock_path, 
new_token.encode("utf-8")) + except Exception as e: + logger.warning(f"Failed to refresh lock {lock_path}: {e}") + async def release(self, owner: LockOwner) -> None: lock_count = len(owner.locks) for lock_path in reversed(owner.locks): diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index d9119c11..214c8c93 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -290,7 +290,10 @@ async def rm( after cleaning up any orphan index records. Acquires a path lock, deletes VectorDB records, then FS files. + Raises ResourceBusyError when the target is locked by an ongoing + operation (e.g. semantic processing). """ + from openviking.storage.errors import LockAcquisitionError, ResourceBusyError from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(uri, ctx) @@ -317,12 +320,15 @@ async def rm( lock_paths = [parent] lock_mode = "point" - async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - result = self.agfs.rm(path, recursive=recursive) - return result + try: + async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + result = self.agfs.rm(path, recursive=recursive) + return result + except LockAcquisitionError: + raise ResourceBusyError(f"Resource is being processed: {uri}") async def mv( self, diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 73bca2f6..adcb4245 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -210,29 +210,29 @@ async def process_resource( return result - # ============ Phase 3.5: 
首次添加立即落盘 ============ + # ============ Phase 3.5: 首次添加立即落盘 + 生命周期锁 ============ root_uri = result.get("root_uri") temp_uri = result.get("temp_uri") # temp_doc_uri candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None + lifecycle_lock_handle_id = "" if root_uri and temp_uri: + from openviking.storage.transaction import LockContext, get_lock_manager + viking_fs = get_viking_fs() + lock_manager = get_lock_manager() target_exists = await viking_fs.exists(root_uri, ctx=ctx) + if not target_exists: # 第一次添加:锁保护下将 temp 移到 final - from openviking.storage.transaction import LockContext, get_lock_manager - dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) parent_path = dst_path.rsplit("/", 1)[0] if "/" in dst_path else dst_path - # 确保父目录存在 parent_uri = "/".join(root_uri.rsplit("/", 1)[:-1]) if parent_uri: await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) - async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): - # Re-resolve URI inside lock to prevent TOCTOU race where - # concurrent add_resource calls resolve to the same final_uri. 
+ async with LockContext(lock_manager, [parent_path], lock_mode="point"): if candidate_uri: root_uri = await self.tree_builder._resolve_unique_uri(candidate_uri) result["root_uri"] = root_uri @@ -241,48 +241,58 @@ async def process_resource( src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) - # 清理 temp 根目录 + # 在 POINT 锁内获取 SUBTREE 锁(消除竞态窗口) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, dst_path + ) + try: await viking_fs.delete_temp(parse_result.temp_dir_path, ctx=ctx) except Exception: pass - # 更新 temp_uri → DAG 直接在 final 上跑 result["temp_uri"] = root_uri + else: + # 增量更新:对目标目录加 SUBTREE 锁 + resource_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, resource_path + ) # ============ Phase 4: Optional Steps ============ build_index = kwargs.get("build_index", True) temp_uri_for_summarize = result.get("temp_uri") or parse_result.temp_dir_path - if summarize: - # Explicit summarization request. - # If build_index is ALSO True, we want vectorization. - # If build_index is False, we skip vectorization. + should_summarize = summarize or build_index + if should_summarize: skip_vec = not build_index try: await self._get_summarizer().summarize( resource_uris=[result["root_uri"]], ctx=ctx, skip_vectorization=skip_vec, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, temp_uris=[temp_uri_for_summarize], **kwargs, ) except Exception as e: logger.error(f"Summarization failed: {e}") result["warnings"] = result.get("warnings", []) + [f"Summarization failed: {e}"] + elif lifecycle_lock_handle_id: + # 无下游处理接管锁,主动释放 + from openviking.storage.transaction import get_lock_manager - elif build_index: - # Standard compatibility mode: "Just Index it" usually implies ingestion flow. - # We assume this means "Ingest and Index", which requires summarization. 
- try: - await self._get_summarizer().summarize( - resource_uris=[result["root_uri"]], - ctx=ctx, - skip_vectorization=False, - temp_uris=[temp_uri_for_summarize], - **kwargs, - ) - except Exception as e: - logger.error(f"Auto-index failed: {e}") - result["warnings"] = result.get("warnings", []) + [f"Auto-index failed: {e}"] + handle = get_lock_manager().get_handle(lifecycle_lock_handle_id) + if handle: + await get_lock_manager().release(handle) return result + + @staticmethod + async def _try_acquire_lifecycle_lock(lock_manager, path: str) -> str: + """尝试获取 SUBTREE 生命周期锁,失败时优雅降级返回空字符串。""" + handle = lock_manager.create_handle() + if await lock_manager.acquire_subtree(handle, path): + return handle.id + logger.warning(f"[ResourceProcessor] Failed to acquire lifecycle lock on {path}") + await lock_manager.release(handle) + return "" diff --git a/openviking/utils/summarizer.py b/openviking/utils/summarizer.py index 36b879e8..e9a1cb20 100644 --- a/openviking/utils/summarizer.py +++ b/openviking/utils/summarizer.py @@ -31,6 +31,7 @@ async def summarize( resource_uris: List[str], ctx: "RequestContext", skip_vectorization: bool = False, + lifecycle_lock_handle_id: str = "", **kwargs, ) -> Dict[str, Any]: """ @@ -72,6 +73,7 @@ async def summarize( skip_vectorization=skip_vectorization, telemetry_id=telemetry.telemetry_id if telemetry.enabled else "", target_uri=uri if uri != temp_uri else None, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, ) await semantic_queue.enqueue(msg) enqueued_count += 1 From 5de24ab79b89c657c7ca2efcbc37a92b331d5aec Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 00:24:53 +0800 Subject: [PATCH 16/18] fix: test --- openviking/storage/transaction/__init__.py | 2 ++ openviking/storage/transaction/lock_manager.py | 8 ++++++++ openviking/storage/viking_fs.py | 6 +++--- tests/client/test_file_operations.py | 6 +++++- tests/client/test_import_export.py | 2 ++ tests/integration/test_full_workflow.py | 2 ++ 6 files changed, 22 
insertions(+), 4 deletions(-) diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index 0fca8816..52e77ff7 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -12,6 +12,7 @@ LockManager, get_lock_manager, init_lock_manager, + release_all_locks, reset_lock_manager, ) from openviking.storage.transaction.path_lock import PathLock @@ -26,5 +27,6 @@ "RedoLog", "get_lock_manager", "init_lock_manager", + "release_all_locks", "reset_lock_manager", ] diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 56dedd79..d20d2ef8 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -251,3 +251,11 @@ def get_lock_manager() -> LockManager: def reset_lock_manager() -> None: global _lock_manager _lock_manager = None + + +async def release_all_locks() -> None: + """Release all active lock handles. **Test-only utility.**""" + if _lock_manager is None: + return + for handle in list(_lock_manager.get_active_handles().values()): + await _lock_manager.release(handle) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 214c8c93..f8b8a356 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -1016,20 +1016,20 @@ def _uri_to_path(self, uri: str, ctx: Optional[RequestContext] = None) -> str: safe_parts = [self._shorten_component(p, self._MAX_FILENAME_BYTES) for p in parts] return f"/local/{account_id}/{'/'.join(safe_parts)}" - _INTERNAL_DIRS = {"_system"} + _INTERNAL_NAMES = {"_system", ".path.ovlock"} _ROOT_PATH = "/local" def _ls_entries(self, path: str) -> List[Dict[str, Any]]: """List directory entries, filtering out internal directories. At account root (/local/{account}), uses VALID_SCOPES whitelist. - At other levels, uses _INTERNAL_DIRS blacklist. + At other levels, uses _INTERNAL_NAMES blacklist. 
""" entries = self.agfs.ls(path) parts = [p for p in path.strip("/").split("/") if p] if len(parts) == 2 and parts[0] == "local": return [e for e in entries if e.get("name") in VikingURI.VALID_SCOPES] - return [e for e in entries if e.get("name") not in self._INTERNAL_DIRS] + return [e for e in entries if e.get("name") not in self._INTERNAL_NAMES] def _path_to_uri(self, path: str, ctx: Optional[RequestContext] = None) -> str: """/local/{account}/... -> viking://... diff --git a/tests/client/test_file_operations.py b/tests/client/test_file_operations.py index 99415f20..a402e4af 100644 --- a/tests/client/test_file_operations.py +++ b/tests/client/test_file_operations.py @@ -8,6 +8,7 @@ import pytest from openviking import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestRm: @@ -22,6 +23,7 @@ async def test_rm_file(self, client: AsyncOpenViking, sample_markdown_file: Path reason="Test rm", ) + await release_all_locks() uris = await client.tree(result["root_uri"]) for data in uris: if not data["isDir"]: @@ -35,7 +37,8 @@ async def test_rm_directory_recursive(self, client: AsyncOpenViking, sample_dire for f in sample_directory.glob("**/*.txt"): await client.add_resource(path=str(f), reason="Test rm dir") - # Get resource directory + # Release lifecycle locks held by add_resource before rm + await release_all_locks() entries = await client.ls("viking://resources/") for data in entries: if data["isDir"]: @@ -57,6 +60,7 @@ async def test_mv_file(self, client: AsyncOpenViking, sample_markdown_file: Path ) uri = result["root_uri"] new_uri = "viking://resources/moved/" + await release_all_locks() await client.mv(uri, new_uri) # Verify original location does not exist with pytest.raises(Exception): # noqa: B017 diff --git a/tests/client/test_import_export.py b/tests/client/test_import_export.py index e4dfe3a9..2aaac8f7 100644 --- a/tests/client/test_import_export.py +++ b/tests/client/test_import_export.py @@ -10,6 +10,7 @@ import pytest 
from openviking import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestExportOvpack: @@ -99,6 +100,7 @@ async def test_import_export_roundtrip( await client.export_ovpack(original_uri, str(export_path)) # Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # Import diff --git a/tests/integration/test_full_workflow.py b/tests/integration/test_full_workflow.py index 823cefd7..b48385d7 100644 --- a/tests/integration/test_full_workflow.py +++ b/tests/integration/test_full_workflow.py @@ -10,6 +10,7 @@ from openviking import AsyncOpenViking from openviking.message import TextPart +from openviking.storage.transaction import release_all_locks @pytest_asyncio.fixture(scope="function") @@ -171,6 +172,7 @@ async def test_export_import_roundtrip( assert export_path.exists() # 4. Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # 5. Import From 08c36731f9e975b3ad581a9cad2048a6079981ba Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 01:44:45 +0800 Subject: [PATCH 17/18] docs: update --- docs/en/concepts/09-transaction.md | 25 ++++++++++++++++++------- docs/zh/concepts/09-transaction.md | 25 ++++++++++++++++++------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 0e95bdf1..edbda724 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -131,27 +131,37 @@ Operation flow: | Problem | Solution | |---------|----------| | File moved from temp to final directory, then crash -> file exists but never searchable | Two separate paths for first-time add vs incremental update | +| Resource already on disk but rm deletes it while semantic processing / vectorization is still running -> wasted work | Lifecycle SUBTREE lock held from finalization through processing completion | **First-time add** (target does not 
exist) — handled in `ResourceProcessor.process_resource` Phase 3.5: ``` -1. Acquire lock on parent_path of final_uri (lock_mode="point") +1. Acquire POINT lock on parent of final_uri 2. agfs.mv temp directory -> final location -3. Release lock -4. Clean up temp directory -5. Enqueue SemanticMsg -> DAG runs on final +3. Acquire SUBTREE lock on final_uri (inside POINT lock, eliminating race window) +4. Release POINT lock +5. Clean up temp directory +6. Enqueue SemanticMsg(lifecycle_lock_handle_id=...) -> DAG runs on final +7. DAG starts lock refresh loop (refreshes timestamp every lock_expire/2 seconds) +8. DAG complete + all embeddings done -> release SUBTREE lock ``` +During this period, `rm` attempting to acquire a SUBTREE lock on the same path will fail with `ResourceBusyError`. + **Incremental update** (target already exists) — temp stays in place: ``` -1. Enqueue SemanticMsg(uri=temp, target_uri=final) -> DAG runs on temp -2. DAG completion triggers sync_diff_callback or move_temp_to_target_callback -3. Each VikingFS.rm / VikingFS.mv inside callbacks acquires its own lock +1. Acquire SUBTREE lock on target_uri (protect existing resource) +2. Enqueue SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG runs on temp, lock refresh loop active +4. DAG completion triggers sync_diff_callback or move_temp_to_target_callback +5. Callback completes -> release SUBTREE lock ``` Note: DAG callbacks do NOT wrap operations in an outer lock. Each `VikingFS.rm` and `VikingFS.mv` has its own lock internally. An outer lock would conflict with these inner locks causing deadlock. +**Server restart recovery**: SemanticMsg is persisted in QueueFS. On restart, `SemanticProcessor` detects that the `lifecycle_lock_handle_id` handle is missing from the in-memory LockManager and re-acquires a SUBTREE lock. 
+ ### session.commit() | Problem | Solution | @@ -316,6 +326,7 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError | Failure scenario | Defense | Recovery timing | |-----------------|--------|-----------------| | Crash during operation | Lock auto-expires + stale detection | Next acquisition of same path lock | +| Crash during add_resource semantic processing | Lifecycle lock expires + SemanticProcessor re-acquires on restart | Worker restart | | Crash during session.commit Phase 2 | RedoLog marker + redo | On restart | | Crash after enqueue, before worker | QueueFS SQLite persistence | Worker restart | | Orphan index | L2 on-demand load cleanup | When user accesses | diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 81d27e02..45a10d63 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -130,27 +130,37 @@ VectorDB 删除失败 -> 直接抛异常,锁自动释放,文件和索引都 | 问题 | 方案 | |------|------| | 文件从临时目录移到正式目录后崩溃 -> 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | +| 资源已落盘但语义处理/向量化还在跑时被 rm 删除 -> 处理白跑 | 生命周期 SUBTREE 锁,从落盘持续到处理完成 | **首次添加**(target 不存在)— 在 `ResourceProcessor.process_resource` Phase 3.5 中处理: ``` -1. 获取锁,锁 final_uri 的父目录(lock_mode="point") +1. 获取 POINT 锁,锁 final_uri 的父目录 2. agfs.mv 临时目录 -> 正式位置 -3. 释放锁 -4. 清理临时目录 -5. 入队 SemanticMsg -> DAG 在 final 上跑 +3. 获取 SUBTREE 锁,锁 final_uri(在 POINT 锁内,消除竞态窗口) +4. 释放 POINT 锁 +5. 清理临时目录 +6. 入队 SemanticMsg(lifecycle_lock_handle_id=...) -> DAG 在 final 上跑 +7. DAG 启动锁刷新循环(每 lock_expire/2 秒刷新时间戳) +8. DAG 完成 + 所有 embedding 完成 -> 释放 SUBTREE 锁 ``` +此期间 `rm` 尝试获取同路径 SUBTREE 锁会失败,抛出 `ResourceBusyError`。 + **增量更新**(target 已存在)— temp 保持不动: ``` -1. 入队 SemanticMsg(uri=temp, target_uri=final) -> DAG 在 temp 上跑 -2. DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback -3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自独立加锁 +1. 获取 SUBTREE 锁,锁 target_uri(保护已有资源) +2. 入队 SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG 在 temp 上跑,启动锁刷新循环 +4. 
DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback +5. callback 执行完毕 -> 释放 SUBTREE 锁 ``` 注意:DAG callback 不在外层加锁。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立锁保护。外层锁会与内部锁冲突导致死锁。 +**服务重启恢复**:SemanticMsg 持久化在 QueueFS 中。重启后 `SemanticProcessor` 发现 `lifecycle_lock_handle_id` 对应的 handle 不在内存中,会重新获取 SUBTREE 锁。 + ### session.commit() | 问题 | 方案 | @@ -315,6 +325,7 @@ async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=d | 异常场景 | 防线 | 恢复时机 | |---------|------|---------| | 操作中途崩溃 | 锁自动过期 + stale 检测 | 下次获取同路径锁时 | +| add_resource 语义处理中途崩溃 | 生命周期锁过期 + SemanticProcessor 重启时重新获取 | worker 重启后 | | session.commit Phase 2 崩溃 | RedoLog 标记 + 重做 | 重启时 | | enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | From cdd222b262378e39cfab0daa43f3e12e92ddfc5b Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 14:23:42 +0800 Subject: [PATCH 18/18] fix: tests --- openviking/server/app.py | 5 +++-- openviking/storage/transaction/lock_manager.py | 4 +++- tests/misc/test_vikingdb_observer.py | 16 +++++++++++----- .../test_hierarchical_retriever_rerank.py | 2 +- tests/server/conftest.py | 6 +++--- tests/server/test_api_search.py | 2 +- tests/session/test_memory_dedup_actions.py | 2 +- 7 files changed, 23 insertions(+), 14 deletions(-) diff --git a/openviking/server/app.py b/openviking/server/app.py index c22794e1..c553c6ff 100644 --- a/openviking/server/app.py +++ b/openviking/server/app.py @@ -59,7 +59,8 @@ def create_app( async def lifespan(app: FastAPI): """Application lifespan handler.""" nonlocal service - if service is None: + owns_service = service is None + if owns_service: service = OpenVikingService() await service.initialize() logger.info("OpenVikingService initialized") @@ -93,7 +94,7 @@ async def lifespan(app: FastAPI): # Cleanup task_tracker.stop_cleanup_loop() - if service: + if owns_service and service: await service.close() logger.info("OpenVikingService closed") diff --git 
a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index d20d2ef8..2fec7e42 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -52,9 +52,11 @@ async def stop(self) -> None: if self._cleanup_task: self._cleanup_task.cancel() try: - await self._cleanup_task + if self._cleanup_task.get_loop() is asyncio.get_running_loop(): + await self._cleanup_task except asyncio.CancelledError: pass + self._cleanup_task = None for handle in list(self._handles.values()): await self._path_lock.release(handle) self._handles.clear() diff --git a/tests/misc/test_vikingdb_observer.py b/tests/misc/test_vikingdb_observer.py index 310d01b6..3dc3cfaf 100644 --- a/tests/misc/test_vikingdb_observer.py +++ b/tests/misc/test_vikingdb_observer.py @@ -8,13 +8,16 @@ import asyncio import openviking as ov +from openviking.async_client import AsyncOpenViking async def test_vikingdb_observer(): """Test VikingDBObserver functionality""" print("=== Test VikingDBObserver ===") - # Create client + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.AsyncOpenViking(path="./test_data/test_vikingdb_observer") try: @@ -72,15 +75,17 @@ async def test_vikingdb_observer(): traceback.print_exc() finally: - # Close client - await client.close() + await AsyncOpenViking.reset() print("Client closed") -def test_sync_client(): +async def test_sync_client(): """Test sync client""" print("\n=== Test sync client ===") + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.OpenViking(path="./test_data/test_vikingdb_observer") try: @@ -109,6 +114,7 @@ def test_sync_client(): finally: client.close() + await AsyncOpenViking.reset() print("Sync client closed") @@ -117,4 +123,4 @@ def test_sync_client(): asyncio.run(test_vikingdb_observer()) # Run sync test - test_sync_client() + asyncio.run(test_sync_client()) 
diff --git a/tests/retrieve/test_hierarchical_retriever_rerank.py b/tests/retrieve/test_hierarchical_retriever_rerank.py index f72682b3..a7ead7bc 100644 --- a/tests/retrieve/test_hierarchical_retriever_rerank.py +++ b/tests/retrieve/test_hierarchical_retriever_rerank.py @@ -19,7 +19,7 @@ def __init__(self) -> None: class DummyEmbedder: - def embed(self, _query: str) -> DummyEmbedResult: + def embed(self, _query: str, is_query: bool = False) -> DummyEmbedResult: return DummyEmbedResult() diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 98cf606f..3bc0e40f 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -56,11 +56,11 @@ class FakeEmbedder(DenseEmbedderBase): def __init__(self): super().__init__(model_name="test-fake-embedder") - def embed(self, text: str) -> EmbedResult: + def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1] * dimension) - def embed_batch(self, texts: list[str]) -> list[EmbedResult]: - return [self.embed(text) for text in texts] + def embed_batch(self, texts: list[str], is_query: bool = False) -> list[EmbedResult]: + return [self.embed(text, is_query=is_query) for text in texts] def get_dimension(self) -> int: return dimension diff --git a/tests/server/test_api_search.py b/tests/server/test_api_search.py index 05d313fb..ce33773c 100644 --- a/tests/server/test_api_search.py +++ b/tests/server/test_api_search.py @@ -12,7 +12,7 @@ @pytest.fixture(autouse=True) def fake_query_embedder(service): class FakeEmbedder: - def embed(self, text: str) -> EmbedResult: + def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1, 0.2, 0.3]) service.viking_fs.query_embedder = FakeEmbedder() diff --git a/tests/session/test_memory_dedup_actions.py b/tests/session/test_memory_dedup_actions.py index 52c445cc..ac273965 100644 --- a/tests/session/test_memory_dedup_actions.py +++ b/tests/session/test_memory_dedup_actions.py @@ -42,7 
+42,7 @@ def __init__(self, dense_vector): class _DummyEmbedder: - def embed(self, _text): + def embed(self, _text, is_query: bool = False): return _DummyEmbedResult([0.1, 0.2, 0.3])